Подключение необходимых библиотек¶
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import numpy as np
from datetime import date
Загрузка данных¶
# Read the training set from train.csv (path is relative to the working directory).
train_df = pd.read_csv('train.csv')
# Preview the first rows to check that the file was parsed as expected.
train_df.head()
| id | timestamp | full_sq | life_sq | floor | max_floor | material | build_year | num_room | kitch_sq | state | product_type | sub_area | area_m | raion_popul | ... | cafe_avg_price_5000 | cafe_count_5000_na_price | cafe_count_5000_price_500 | cafe_count_5000_price_1000 | cafe_count_5000_price_1500 | cafe_count_5000_price_2500 | cafe_count_5000_price_4000 | cafe_count_5000_price_high | big_church_count_5000 | church_count_5000 | mosque_count_5000 | leisure_count_5000 | sport_count_5000 | market_count_5000 | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2011-08-20 | 43 | 27.0 | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | Investment | Bibirevo | 6.407578e+06 | 155572 | ... | 947.14 | 12 | 39 | 48 | 40 | 9 | 4 | 0 | 13 | 22 | 1 | 0 | 52 | 4 | 5850000 |
| 1 | 2 | 2011-08-23 | 34 | 19.0 | 3.0 | NaN | NaN | NaN | NaN | NaN | NaN | Investment | Nagatinskij Zaton | 9.589337e+06 | 115352 | ... | 911.31 | 9 | 49 | 65 | 36 | 15 | 3 | 0 | 15 | 29 | 1 | 10 | 66 | 14 | 6000000 |
| 2 | 3 | 2011-08-27 | 43 | 29.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | Investment | Tekstil'shhiki | 4.808270e+06 | 101708 | ... | 949.55 | 10 | 29 | 45 | 25 | 10 | 3 | 0 | 11 | 27 | 0 | 4 | 67 | 10 | 5700000 |
| 3 | 4 | 2011-09-01 | 89 | 50.0 | 9.0 | NaN | NaN | NaN | NaN | NaN | NaN | Investment | Mitino | 1.258354e+07 | 178473 | ... | 1242.11 | 4 | 7 | 21 | 15 | 11 | 2 | 1 | 4 | 4 | 0 | 0 | 26 | 3 | 13100000 |
| 4 | 5 | 2011-09-05 | 77 | 77.0 | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | Investment | Basmannoe | 8.398461e+06 | 108171 | ... | 1132.66 | 143 | 566 | 578 | 552 | 319 | 108 | 17 | 135 | 236 | 2 | 91 | 195 | 14 | 16331452 |
5 rows × 292 columns
Краткая информация о данных¶
# Print the frame's size, dtype counts and memory footprint.
train_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 30471 entries, 0 to 30470 Columns: 292 entries, id to price_doc dtypes: float64(119), int64(157), object(16) memory usage: 67.9+ MB
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train_df.describe()
| id | full_sq | life_sq | floor | max_floor | material | build_year | num_room | kitch_sq | state | area_m | raion_popul | green_zone_part | indust_part | children_preschool | ... | cafe_avg_price_5000 | cafe_count_5000_na_price | cafe_count_5000_price_500 | cafe_count_5000_price_1000 | cafe_count_5000_price_1500 | cafe_count_5000_price_2500 | cafe_count_5000_price_4000 | cafe_count_5000_price_high | big_church_count_5000 | church_count_5000 | mosque_count_5000 | leisure_count_5000 | sport_count_5000 | market_count_5000 | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 30471.000000 | 30471.000000 | 24088.000000 | 30304.000000 | 20899.000000 | 20899.000000 | 1.686600e+04 | 20899.000000 | 20899.000000 | 16912.000000 | 3.047100e+04 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | ... | 30174.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 3.047100e+04 |
| mean | 15237.917397 | 54.214269 | 34.403271 | 7.670803 | 12.558974 | 1.827121 | 3.068057e+03 | 1.909804 | 6.399301 | 2.107025 | 1.765705e+07 | 84056.425552 | 0.218922 | 0.118871 | 5140.026156 | ... | 1021.689513 | 17.806898 | 66.195530 | 73.442421 | 63.469660 | 32.058318 | 10.783860 | 1.771783 | 15.045552 | 30.251518 | 0.442421 | 8.648814 | 52.796593 | 5.987070 | 7.123035e+06 |
| std | 8796.501536 | 38.031487 | 52.285733 | 5.319989 | 6.756550 | 1.481154 | 1.543878e+05 | 0.851805 | 28.265979 | 0.880148 | 2.064961e+07 | 57871.285899 | 0.175090 | 0.118688 | 3816.625140 | ... | 194.117696 | 33.269057 | 125.934584 | 126.167671 | 124.076662 | 73.465611 | 28.385679 | 5.418807 | 29.118668 | 47.347938 | 0.609269 | 20.580741 | 46.292660 | 4.889219 | 4.780111e+06 |
| min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 1.000000 | 2.081628e+06 | 2546.000000 | 0.001879 | 0.000000 | 175.000000 | ... | 400.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+05 |
| 25% | 7620.500000 | 38.000000 | 20.000000 | 3.000000 | 9.000000 | 1.000000 | 1.967000e+03 | 1.000000 | 1.000000 | 1.000000 | 7.307411e+06 | 21819.000000 | 0.063755 | 0.019509 | 1706.000000 | ... | 909.380000 | 1.000000 | 4.000000 | 8.000000 | 6.000000 | 2.000000 | 1.000000 | 0.000000 | 2.000000 | 9.000000 | 0.000000 | 0.000000 | 11.000000 | 1.000000 | 4.740002e+06 |
| 50% | 15238.000000 | 49.000000 | 30.000000 | 6.500000 | 12.000000 | 1.000000 | 1.979000e+03 | 2.000000 | 6.000000 | 2.000000 | 1.050803e+07 | 83502.000000 | 0.167526 | 0.072158 | 4857.000000 | ... | 966.670000 | 8.000000 | 28.000000 | 36.000000 | 24.000000 | 8.000000 | 2.000000 | 0.000000 | 7.000000 | 16.000000 | 0.000000 | 2.000000 | 48.000000 | 5.000000 | 6.274411e+06 |
| 75% | 22855.500000 | 63.000000 | 43.000000 | 11.000000 | 17.000000 | 2.000000 | 2.005000e+03 | 2.000000 | 9.000000 | 3.000000 | 1.803644e+07 | 122862.000000 | 0.336177 | 0.195781 | 7103.000000 | ... | 1091.670000 | 15.000000 | 59.000000 | 69.000000 | 51.000000 | 21.000000 | 5.000000 | 1.000000 | 12.000000 | 28.000000 | 1.000000 | 7.000000 | 76.000000 | 10.000000 | 8.300000e+06 |
| max | 30473.000000 | 5326.000000 | 7478.000000 | 77.000000 | 117.000000 | 6.000000 | 2.005201e+07 | 19.000000 | 2014.000000 | 33.000000 | 2.060718e+08 | 247469.000000 | 0.852923 | 0.521867 | 19223.000000 | ... | 2437.500000 | 174.000000 | 650.000000 | 648.000000 | 641.000000 | 377.000000 | 147.000000 | 30.000000 | 151.000000 | 250.000000 | 2.000000 | 106.000000 | 218.000000 | 21.000000 | 1.111111e+08 |
8 rows × 276 columns
Просмотр всех столбцов¶
# Print every column name as one flat list, since the table preview truncates them.
column_names = train_df.columns.tolist()
print(column_names)
['id', 'timestamp', 'full_sq', 'life_sq', 'floor', 'max_floor', 'material', 'build_year', 'num_room', 'kitch_sq', 'state', 'product_type', 'sub_area', 'area_m', 'raion_popul', 'green_zone_part', 'indust_part', 'children_preschool', 'preschool_quota', 'preschool_education_centers_raion', 'children_school', 'school_quota', 'school_education_centers_raion', 'school_education_centers_top_20_raion', 'hospital_beds_raion', 'healthcare_centers_raion', 'university_top_20_raion', 'sport_objects_raion', 'additional_education_raion', 'culture_objects_top_25', 'culture_objects_top_25_raion', 'shopping_centers_raion', 'office_raion', 'thermal_power_plant_raion', 'incineration_raion', 'oil_chemistry_raion', 'radiation_raion', 'railroad_terminal_raion', 'big_market_raion', 'nuclear_reactor_raion', 'detention_facility_raion', 'full_all', 'male_f', 'female_f', 'young_all', 'young_male', 'young_female', 'work_all', 'work_male', 'work_female', 'ekder_all', 'ekder_male', 'ekder_female', '0_6_all', '0_6_male', '0_6_female', '7_14_all', '7_14_male', '7_14_female', '0_17_all', '0_17_male', '0_17_female', '16_29_all', '16_29_male', '16_29_female', '0_13_all', '0_13_male', '0_13_female', 'raion_build_count_with_material_info', 'build_count_block', 'build_count_wood', 'build_count_frame', 'build_count_brick', 'build_count_monolith', 'build_count_panel', 'build_count_foam', 'build_count_slag', 'build_count_mix', 'raion_build_count_with_builddate_info', 'build_count_before_1920', 'build_count_1921-1945', 'build_count_1946-1970', 'build_count_1971-1995', 'build_count_after_1995', 'ID_metro', 'metro_min_avto', 'metro_km_avto', 'metro_min_walk', 'metro_km_walk', 'kindergarten_km', 'school_km', 'park_km', 'green_zone_km', 'industrial_km', 'water_treatment_km', 'cemetery_km', 'incineration_km', 'railroad_station_walk_km', 'railroad_station_walk_min', 'ID_railroad_station_walk', 'railroad_station_avto_km', 'railroad_station_avto_min', 'ID_railroad_station_avto', 'public_transport_station_km', 
'public_transport_station_min_walk', 'water_km', 'water_1line', 'mkad_km', 'ttk_km', 'sadovoe_km', 'bulvar_ring_km', 'kremlin_km', 'big_road1_km', 'ID_big_road1', 'big_road1_1line', 'big_road2_km', 'ID_big_road2', 'railroad_km', 'railroad_1line', 'zd_vokzaly_avto_km', 'ID_railroad_terminal', 'bus_terminal_avto_km', 'ID_bus_terminal', 'oil_chemistry_km', 'nuclear_reactor_km', 'radiation_km', 'power_transmission_line_km', 'thermal_power_plant_km', 'ts_km', 'big_market_km', 'market_shop_km', 'fitness_km', 'swim_pool_km', 'ice_rink_km', 'stadium_km', 'basketball_km', 'hospice_morgue_km', 'detention_facility_km', 'public_healthcare_km', 'university_km', 'workplaces_km', 'shopping_centers_km', 'office_km', 'additional_education_km', 'preschool_km', 'big_church_km', 'church_synagogue_km', 'mosque_km', 'theater_km', 'museum_km', 'exhibition_km', 'catering_km', 'ecology', 'green_part_500', 'prom_part_500', 'office_count_500', 'office_sqm_500', 'trc_count_500', 'trc_sqm_500', 'cafe_count_500', 'cafe_sum_500_min_price_avg', 'cafe_sum_500_max_price_avg', 'cafe_avg_price_500', 'cafe_count_500_na_price', 'cafe_count_500_price_500', 'cafe_count_500_price_1000', 'cafe_count_500_price_1500', 'cafe_count_500_price_2500', 'cafe_count_500_price_4000', 'cafe_count_500_price_high', 'big_church_count_500', 'church_count_500', 'mosque_count_500', 'leisure_count_500', 'sport_count_500', 'market_count_500', 'green_part_1000', 'prom_part_1000', 'office_count_1000', 'office_sqm_1000', 'trc_count_1000', 'trc_sqm_1000', 'cafe_count_1000', 'cafe_sum_1000_min_price_avg', 'cafe_sum_1000_max_price_avg', 'cafe_avg_price_1000', 'cafe_count_1000_na_price', 'cafe_count_1000_price_500', 'cafe_count_1000_price_1000', 'cafe_count_1000_price_1500', 'cafe_count_1000_price_2500', 'cafe_count_1000_price_4000', 'cafe_count_1000_price_high', 'big_church_count_1000', 'church_count_1000', 'mosque_count_1000', 'leisure_count_1000', 'sport_count_1000', 'market_count_1000', 'green_part_1500', 'prom_part_1500', 
'office_count_1500', 'office_sqm_1500', 'trc_count_1500', 'trc_sqm_1500', 'cafe_count_1500', 'cafe_sum_1500_min_price_avg', 'cafe_sum_1500_max_price_avg', 'cafe_avg_price_1500', 'cafe_count_1500_na_price', 'cafe_count_1500_price_500', 'cafe_count_1500_price_1000', 'cafe_count_1500_price_1500', 'cafe_count_1500_price_2500', 'cafe_count_1500_price_4000', 'cafe_count_1500_price_high', 'big_church_count_1500', 'church_count_1500', 'mosque_count_1500', 'leisure_count_1500', 'sport_count_1500', 'market_count_1500', 'green_part_2000', 'prom_part_2000', 'office_count_2000', 'office_sqm_2000', 'trc_count_2000', 'trc_sqm_2000', 'cafe_count_2000', 'cafe_sum_2000_min_price_avg', 'cafe_sum_2000_max_price_avg', 'cafe_avg_price_2000', 'cafe_count_2000_na_price', 'cafe_count_2000_price_500', 'cafe_count_2000_price_1000', 'cafe_count_2000_price_1500', 'cafe_count_2000_price_2500', 'cafe_count_2000_price_4000', 'cafe_count_2000_price_high', 'big_church_count_2000', 'church_count_2000', 'mosque_count_2000', 'leisure_count_2000', 'sport_count_2000', 'market_count_2000', 'green_part_3000', 'prom_part_3000', 'office_count_3000', 'office_sqm_3000', 'trc_count_3000', 'trc_sqm_3000', 'cafe_count_3000', 'cafe_sum_3000_min_price_avg', 'cafe_sum_3000_max_price_avg', 'cafe_avg_price_3000', 'cafe_count_3000_na_price', 'cafe_count_3000_price_500', 'cafe_count_3000_price_1000', 'cafe_count_3000_price_1500', 'cafe_count_3000_price_2500', 'cafe_count_3000_price_4000', 'cafe_count_3000_price_high', 'big_church_count_3000', 'church_count_3000', 'mosque_count_3000', 'leisure_count_3000', 'sport_count_3000', 'market_count_3000', 'green_part_5000', 'prom_part_5000', 'office_count_5000', 'office_sqm_5000', 'trc_count_5000', 'trc_sqm_5000', 'cafe_count_5000', 'cafe_sum_5000_min_price_avg', 'cafe_sum_5000_max_price_avg', 'cafe_avg_price_5000', 'cafe_count_5000_na_price', 'cafe_count_5000_price_500', 'cafe_count_5000_price_1000', 'cafe_count_5000_price_1500', 'cafe_count_5000_price_2500', 
'cafe_count_5000_price_4000', 'cafe_count_5000_price_high', 'big_church_count_5000', 'church_count_5000', 'mosque_count_5000', 'leisure_count_5000', 'sport_count_5000', 'market_count_5000', 'price_doc']
Самые важные характеристики для оценки недвижимости¶
1. Характеристики объекта:
- Общая площадь: `full_sq`, м²
- Жилая площадь: `life_sq`, м²
- Этаж: `floor` из `max_floor`
- Материал: `material`
- Год постройки: `build_year`
- Количество комнат: `num_room`
- Площадь кухни: `kitch_sq`, м²
- Состояние: `state`
- Дата постройки: `build_year`
2. Характеристики местоположения:
- Район: `sub_area`
- Время до метро на машине: `metro_min_avto`, мин.
- Расстояние до метро на машине: `metro_km_avto`, км
- Время до метро пешком: `metro_min_walk`, мин.
- Расстояние до детского сада: `kindergarten_km`, км
- Расстояние до школы: `school_km`, км
- Расстояние до парка: `park_km`, км
- Расстояние до МКАД: `mkad_km`, км
- Расстояние до ТТК: `ttk_km`, км
- Расстояние до Садового кольца: `sadovoe_km`, км
- Расстояние до Бульварного кольца: `bulvar_ring_km`, км
- Расстояние до Кремля: `kremlin_km`, км
3. Характеристики окружения:
- Население: `full_all`, человек
- Доля зеленых зон: `green_zone_part`, %
- Доля промышленных зон: `indust_part`, %
- Количество мест в дошкольных учреждениях: `preschool_quota`
- Количество мест в школах: `school_quota`
- Количество больничных коек на 1000 человек: `hospital_beds_raion`
Целевая переменная¶
Стоимость недвижимости: price_doc
Характеристики транзакции¶
Дата: timestamp
# Columns selected as the most important ones for valuing a property,
# grouped by what they describe. Each column is listed exactly once.
important_columns = [
    # Property characteristics
    "full_sq",
    "life_sq",
    "floor",
    "material",
    "max_floor",
    "build_year",
    "num_room",
    "kitch_sq",
    "state",
    # Location characteristics
    "sub_area",
    "metro_min_avto",
    "metro_km_avto",
    "metro_min_walk",
    "kindergarten_km",
    "school_km",
    "park_km",
    "mkad_km",
    "ttk_km",
    "sadovoe_km",
    "bulvar_ring_km",
    "kremlin_km",
    # Neighbourhood characteristics
    "full_all",
    "green_zone_part",
    "indust_part",
    "preschool_quota",
    "school_quota",
    "hospital_beds_raion",
    # Target variable
    "price_doc",
    # Transaction date
    "timestamp",
]
# Narrow the frame to the selected columns, in the order given above.
# NOTE: filter(items=...) skips labels that are absent from the frame instead
# of raising, so a misspelled column name would go unnoticed here.
train_df = train_df.filter(items=important_columns)
train_df
| full_sq | life_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | metro_min_avto | metro_km_avto | metro_min_walk | kindergarten_km | school_km | park_km | mkad_km | ttk_km | sadovoe_km | bulvar_ring_km | kremlin_km | full_all | green_zone_part | indust_part | preschool_quota | school_quota | hospital_beds_raion | price_doc | timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 43 | 27.0 | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | Bibirevo | 2.590241 | 1.131260 | 13.575119 | 0.145700 | 0.177975 | 2.158587 | 1.422391 | 10.918587 | 13.100618 | 13.675657 | 15.156211 | 86206 | 0.189727 | 0.000070 | 5001.0 | 11065.0 | 240.0 | 5850000 | 2011-08-20 |
| 1 | 34 | 19.0 | 3.0 | NaN | NaN | NaN | NaN | NaN | NaN | Nagatinskij Zaton | 0.936700 | 0.647337 | 7.620630 | 0.147754 | 0.273345 | 0.550690 | 9.503405 | 3.103996 | 6.444333 | 8.132640 | 8.698054 | 76284 | 0.372602 | 0.049637 | 3119.0 | 6237.0 | 229.0 | 6000000 | 2011-08-23 |
| 2 | 43 | 29.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | Tekstil'shhiki | 2.120999 | 1.637996 | 17.351515 | 0.049102 | 0.158072 | 0.374848 | 5.604800 | 2.927487 | 6.963403 | 8.054252 | 9.067885 | 101982 | 0.112560 | 0.118537 | 1463.0 | 5580.0 | 1183.0 | 5700000 | 2011-08-27 |
| 3 | 89 | 50.0 | 9.0 | NaN | NaN | NaN | NaN | NaN | NaN | Mitino | 1.489049 | 0.984537 | 11.565624 | 0.179441 | 0.236455 | 0.078090 | 2.677824 | 14.606501 | 17.457198 | 18.309433 | 19.487005 | 21155 | 0.194703 | 0.069753 | 6839.0 | 17063.0 | NaN | 13100000 | 2011-09-01 |
| 4 | 77 | 77.0 | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | Basmannoe | 1.257186 | 0.876620 | 8.266305 | 0.247901 | 0.376838 | 0.258289 | 11.616653 | 1.721834 | 0.046810 | 0.787593 | 2.578671 | 28179 | 0.015234 | 0.037316 | 3240.0 | 7770.0 | 562.0 | 16331452 | 2011-09-05 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30466 | 44 | 27.0 | 7.0 | 1.0 | 9.0 | 1975.0 | 2.0 | 6.0 | 3.0 | Otradnoe | 1.384021 | 0.659002 | 8.158093 | 0.132645 | 0.349899 | 1.972527 | 3.762408 | 8.361875 | 10.543724 | 11.118577 | 12.599292 | 61396 | 0.096270 | 0.300323 | 5088.0 | 12721.0 | NaN | 7400000 | 2015-06-30 |
| 30467 | 86 | 59.0 | 3.0 | 2.0 | 9.0 | 1935.0 | 4.0 | 10.0 | 3.0 | Tverskoe | 1.060577 | 0.781217 | 9.374609 | 0.276256 | 0.362681 | 1.036452 | 13.100989 | 1.238732 | 1.203215 | 1.874868 | 3.269284 | 116742 | 0.065444 | 0.000078 | 1874.0 | 6772.0 | 1046.0 | 25000000 | 2015-06-30 |
| 30468 | 45 | NaN | 10.0 | 1.0 | 20.0 | NaN | 1.0 | 1.0 | 1.0 | Poselenie Vnukovskoe | 2.152792 | 1.722233 | 20.666800 | 0.897889 | 1.234235 | 4.566595 | 7.123215 | 17.148737 | 19.868997 | 21.038561 | 21.905792 | 17790 | 0.496315 | 0.007122 | NaN | NaN | NaN | 6970959 | 2015-06-30 |
| 30469 | 64 | 32.0 | 5.0 | 1.0 | 15.0 | 2003.0 | 2.0 | 11.0 | 2.0 | Obruchevskoe | 3.377814 | 2.047312 | 24.567748 | 0.203020 | 0.130667 | 1.772506 | 2.327138 | 8.940313 | 11.752036 | 12.872535 | 13.622569 | 83844 | 0.167526 | 0.093443 | 2372.0 | 6083.0 | 3300.0 | 13500000 | 2015-06-30 |
| 30470 | 43 | 28.0 | 1.0 | 1.0 | 9.0 | 1968.0 | 2.0 | 6.0 | 2.0 | Novogireevo | 0.584636 | 0.454650 | 5.455795 | 0.093619 | 0.378950 | 0.848766 | 1.920884 | 6.809408 | 9.675169 | 10.228634 | 11.812614 | 72131 | 0.063755 | 0.038693 | 2215.0 | 5824.0 | 1015.0 | 5600000 | 2015-06-30 |
30471 rows × 29 columns
# Re-check per-column non-null counts and dtypes after narrowing the column set.
train_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 30471 entries, 0 to 30470 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 full_sq 30471 non-null int64 1 life_sq 24088 non-null float64 2 floor 30304 non-null float64 3 material 20899 non-null float64 4 max_floor 20899 non-null float64 5 build_year 16866 non-null float64 6 num_room 20899 non-null float64 7 kitch_sq 20899 non-null float64 8 state 16912 non-null float64 9 sub_area 30471 non-null object 10 metro_min_avto 30471 non-null float64 11 metro_km_avto 30471 non-null float64 12 metro_min_walk 30446 non-null float64 13 kindergarten_km 30471 non-null float64 14 school_km 30471 non-null float64 15 park_km 30471 non-null float64 16 mkad_km 30471 non-null float64 17 ttk_km 30471 non-null float64 18 sadovoe_km 30471 non-null float64 19 bulvar_ring_km 30471 non-null float64 20 kremlin_km 30471 non-null float64 21 full_all 30471 non-null int64 22 green_zone_part 30471 non-null float64 23 indust_part 30471 non-null float64 24 preschool_quota 23783 non-null float64 25 school_quota 23786 non-null float64 26 hospital_beds_raion 16030 non-null float64 27 price_doc 30471 non-null int64 28 timestamp 30471 non-null object dtypes: float64(24), int64(3), object(2) memory usage: 6.7+ MB
# Summary statistics of the remaining numeric columns; the min/max rows help spot implausible values.
train_df.describe()
| full_sq | life_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | metro_min_avto | metro_km_avto | metro_min_walk | kindergarten_km | school_km | park_km | mkad_km | ttk_km | sadovoe_km | bulvar_ring_km | kremlin_km | full_all | green_zone_part | indust_part | preschool_quota | school_quota | hospital_beds_raion | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 30471.000000 | 24088.000000 | 30304.000000 | 20899.000000 | 20899.000000 | 1.686600e+04 | 20899.000000 | 20899.000000 | 16912.000000 | 30471.000000 | 30471.000000 | 30446.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 30471.000000 | 3.047100e+04 | 30471.000000 | 30471.000000 | 23783.000000 | 23786.000000 | 16030.000000 | 3.047100e+04 |
| mean | 54.214269 | 34.403271 | 7.670803 | 1.827121 | 12.558974 | 3.068057e+03 | 1.909804 | 6.399301 | 2.107025 | 4.961273 | 3.701464 | 42.737601 | 0.981684 | 1.323956 | 3.099942 | 6.274764 | 11.318152 | 14.056721 | 15.023338 | 16.044808 | 1.463062e+05 | 0.218922 | 0.118871 | 3271.272464 | 8324.970739 | 1190.738677 | 7.123035e+06 |
| std | 38.031487 | 52.285733 | 5.319989 | 1.481154 | 6.756550 | 1.543878e+05 | 0.851805 | 28.265979 | 0.880148 | 6.553515 | 5.808432 | 69.303131 | 2.121956 | 3.058145 | 3.965363 | 5.142492 | 8.066780 | 8.343162 | 8.475342 | 8.441964 | 2.830251e+05 | 0.175090 | 0.118688 | 2169.759592 | 4289.734174 | 1057.015001 | 4.780111e+06 |
| min | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000470 | 0.000000 | 0.003737 | 0.013626 | 0.001935 | 0.000355 | 0.001947 | 0.072897 | 2.546000e+03 | 0.001879 | 0.000000 | 0.000000 | 1012.000000 | 0.000000 | 1.000000e+05 |
| 25% | 38.000000 | 20.000000 | 3.000000 | 1.000000 | 9.000000 | 1.967000e+03 | 1.000000 | 1.000000 | 1.000000 | 1.721280 | 1.036568 | 11.484150 | 0.199946 | 0.269716 | 0.973297 | 2.633404 | 5.339845 | 8.346336 | 9.256658 | 10.460471 | 2.817900e+04 | 0.063755 | 0.019509 | 1874.000000 | 5782.000000 | 520.000000 | 4.740002e+06 |
| 50% | 49.000000 | 30.000000 | 6.500000 | 1.000000 | 12.000000 | 1.979000e+03 | 2.000000 | 6.000000 | 2.000000 | 2.803299 | 1.783625 | 20.447209 | 0.353762 | 0.474862 | 1.803889 | 5.467510 | 9.842632 | 12.748712 | 13.611476 | 14.879161 | 8.521900e+04 | 0.167526 | 0.072158 | 2854.000000 | 7377.000000 | 990.000000 | 6.274411e+06 |
| 75% | 63.000000 | 43.000000 | 11.000000 | 2.000000 | 17.000000 | 2.005000e+03 | 2.000000 | 9.000000 | 3.000000 | 4.831733 | 3.776836 | 45.322032 | 0.971417 | 0.886531 | 3.404787 | 8.184752 | 15.675454 | 18.716620 | 19.945193 | 20.666814 | 1.251110e+05 | 0.336177 | 0.195781 | 4050.000000 | 9891.000000 | 1786.000000 | 8.300000e+06 |
| max | 5326.000000 | 7478.000000 | 77.000000 | 6.000000 | 117.000000 | 2.005201e+07 | 19.000000 | 2014.000000 | 33.000000 | 61.438472 | 74.905763 | 711.215806 | 29.085774 | 47.394706 | 47.351538 | 53.277832 | 66.033200 | 68.853047 | 69.984874 | 70.738769 | 1.716730e+06 | 0.852923 | 0.521867 | 11926.000000 | 24750.000000 | 4849.000000 | 1.111111e+08 |
# Print how many rows fall into each district, most frequent first.
# Series.to_dict() yields plain Python ints, so the printout stays
# {'name': 1776, ...} instead of showing NumPy scalar reprs such as
# np.int64(1776), which dict(series) produces under NumPy 2.
sub_area_counts = train_df['sub_area'].value_counts().to_dict()
print(sub_area_counts)
{'Poselenie Sosenskoe': 1776, 'Nekrasovka': 1611, 'Poselenie Vnukovskoe': 1372, 'Poselenie Moskovskij': 925, 'Poselenie Voskresenskoe': 713, 'Mitino': 679, 'Tverskoe': 678, 'Krjukovo': 518, "Mar'ino": 508, 'Poselenie Filimonkovskoe': 496, 'Juzhnoe Butovo': 451, 'Poselenie Shherbinka': 443, 'Solncevo': 421, 'Zapadnoe Degunino': 410, 'Poselenie Desjonovskoe': 362, 'Otradnoe': 353, 'Nagatinskij Zaton': 327, 'Bogorodskoe': 305, 'Nagornoe': 305, 'Strogino': 301, 'Izmajlovo': 300, "Tekstil'shhiki": 298, 'Ljublino': 297, "Gol'janovo": 295, 'Severnoe Tushino': 282, 'Chertanovo Juzhnoe': 273, 'Birjulevo Vostochnoe': 268, 'Vyhino-Zhulebino': 264, 'Horoshevo-Mnevniki': 262, 'Zjuzino': 259, 'Ochakovo-Matveevskoe': 255, 'Perovo': 247, 'Ramenki': 241, 'Kosino-Uhtomskoe': 237, 'Jasenevo': 237, 'Bibirevo': 230, 'Golovinskoe': 224, "Kon'kovo": 220, 'Caricyno': 220, "Kuz'minki": 220, 'Veshnjaki': 213, 'Akademicheskoe': 211, 'Orehovo-Borisovo Juzhnoe': 208, 'Koptevo': 207, 'Orehovo-Borisovo Severnoe': 206, 'Novogireevo': 201, 'Chertanovo Severnoe': 200, 'Danilovskoe': 199, 'Mozhajskoe': 197, 'Ivanovskoe': 197, "Chertanovo Central'noe": 196, 'Pechatniki': 192, 'Presnenskoe': 190, 'Sokolinaja Gora': 188, 'Obruchevskoe': 185, 'Kuncevo': 184, 'Severnoe Butovo': 182, 'Brateevo': 182, 'Rjazanskij': 180, 'Hovrino': 178, 'Losinoostrovskoe': 177, 'Juzhnoe Tushino': 175, 'Dmitrovskoe': 174, 'Taganskoe': 173, 'Severnoe Medvedkovo': 167, 'Beskudnikovskoe': 166, 'Teplyj Stan': 165, 'Pokrovskoe Streshnevo': 164, 'Severnoe Izmajlovo': 163, 'Troickij okrug': 158, 'Cheremushki': 158, 'Nagatino-Sadovniki': 158, 'Shhukino': 155, 'Timirjazevskoe': 154, 'Vostochnoe Izmajlovo': 154, 'Preobrazhenskoe': 152, 'Novo-Peredelkino': 149, 'Filevskij Park': 148, 'Poselenie Novofedorovskoe': 148, 'Lomonosovskoe': 147, 'Kotlovka': 147, 'Juzhnoe Medvedkovo': 143, 'Poselenie Pervomajskoe': 142, 'Novokosino': 139, 'Fili Davydkovo': 137, 'Horoshevskoe': 136, 'Levoberezhnoe': 135, 'Donskoe': 135, 'Sviblovo': 131, 
'Vojkovskoe': 131, 'Zjablikovo': 127, 'Troparevo-Nikulino': 126, 'Juzhnoportovoe': 126, 'Lianozovo': 126, 'Ajeroport': 123, 'Babushkinskoe': 123, 'Jaroslavskoe': 121, 'Lefortovo': 119, 'Vostochnoe Degunino': 118, "Mar'ina Roshha": 116, 'Birjulevo Zapadnoe': 115, 'Matushkino': 111, 'Savelki': 105, 'Krylatskoe': 103, 'Butyrskoe': 101, 'Alekseevskoe': 100, 'Prospekt Vernadskogo': 100, 'Silino': 100, "Moskvorech'e-Saburovo": 99, 'Basmannoe': 98, 'Meshhanskoe': 94, 'Staroe Krjukovo': 92, 'Hamovniki': 90, 'Savelovskoe': 85, 'Marfino': 85, 'Jakimanka': 81, 'Gagarinskoe': 79, 'Ostankinskoe': 79, 'Nizhegorodskoe': 77, 'Sokol': 72, "Altuf'evskoe": 68, 'Rostokino': 64, 'Kurkino': 62, 'Begovoe': 60, "Sokol'niki": 60, 'Metrogorodok': 58, 'Dorogomilovo': 56, "Zamoskvorech'e": 50, 'Kapotnja': 49, 'Vnukovo': 44, 'Severnoe': 37, "Krasnosel'skoe": 37, 'Poselenie Rjazanovskoe': 34, 'Poselenie Rogovskoe': 31, 'Poselenie Krasnopahorskoe': 27, 'Poselenie Kokoshkino': 20, 'Poselenie Mosrentgen': 19, 'Arbat': 15, 'Poselenie Voronovskoe': 7, 'Vostochnoe': 7, 'Poselenie Marushkinskoe': 6, 'Molzhaninovskoe': 3, 'Poselenie Shhapovskoe': 2, 'Poselenie Kievskij': 2, 'Poselenie Klenovskoe': 1, 'Poselenie Mihajlovo-Jarcevskoe': 1}
Удаляем записи, которые не входят в период с августа 2011 по июнь 2015¶
# Keep only transactions dated 2011-08-01 through 2015-06-30, both ends included.
# timestamp is held as text at this point; ISO "YYYY-MM-DD" strings sort
# chronologically, so a plain string range check selects the right rows.
in_period = train_df['timestamp'].between('2011-08-01', '2015-06-30')
train_df = train_df[in_period]
train_df
| full_sq | life_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | metro_min_avto | metro_km_avto | metro_min_walk | kindergarten_km | school_km | park_km | mkad_km | ttk_km | sadovoe_km | bulvar_ring_km | kremlin_km | full_all | green_zone_part | indust_part | preschool_quota | school_quota | hospital_beds_raion | price_doc | timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 43 | 27.0 | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | Bibirevo | 2.590241 | 1.131260 | 13.575119 | 0.145700 | 0.177975 | 2.158587 | 1.422391 | 10.918587 | 13.100618 | 13.675657 | 15.156211 | 86206 | 0.189727 | 0.000070 | 5001.0 | 11065.0 | 240.0 | 5850000 | 2011-08-20 |
| 1 | 34 | 19.0 | 3.0 | NaN | NaN | NaN | NaN | NaN | NaN | Nagatinskij Zaton | 0.936700 | 0.647337 | 7.620630 | 0.147754 | 0.273345 | 0.550690 | 9.503405 | 3.103996 | 6.444333 | 8.132640 | 8.698054 | 76284 | 0.372602 | 0.049637 | 3119.0 | 6237.0 | 229.0 | 6000000 | 2011-08-23 |
| 2 | 43 | 29.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | Tekstil'shhiki | 2.120999 | 1.637996 | 17.351515 | 0.049102 | 0.158072 | 0.374848 | 5.604800 | 2.927487 | 6.963403 | 8.054252 | 9.067885 | 101982 | 0.112560 | 0.118537 | 1463.0 | 5580.0 | 1183.0 | 5700000 | 2011-08-27 |
| 3 | 89 | 50.0 | 9.0 | NaN | NaN | NaN | NaN | NaN | NaN | Mitino | 1.489049 | 0.984537 | 11.565624 | 0.179441 | 0.236455 | 0.078090 | 2.677824 | 14.606501 | 17.457198 | 18.309433 | 19.487005 | 21155 | 0.194703 | 0.069753 | 6839.0 | 17063.0 | NaN | 13100000 | 2011-09-01 |
| 4 | 77 | 77.0 | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | Basmannoe | 1.257186 | 0.876620 | 8.266305 | 0.247901 | 0.376838 | 0.258289 | 11.616653 | 1.721834 | 0.046810 | 0.787593 | 2.578671 | 28179 | 0.015234 | 0.037316 | 3240.0 | 7770.0 | 562.0 | 16331452 | 2011-09-05 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30466 | 44 | 27.0 | 7.0 | 1.0 | 9.0 | 1975.0 | 2.0 | 6.0 | 3.0 | Otradnoe | 1.384021 | 0.659002 | 8.158093 | 0.132645 | 0.349899 | 1.972527 | 3.762408 | 8.361875 | 10.543724 | 11.118577 | 12.599292 | 61396 | 0.096270 | 0.300323 | 5088.0 | 12721.0 | NaN | 7400000 | 2015-06-30 |
| 30467 | 86 | 59.0 | 3.0 | 2.0 | 9.0 | 1935.0 | 4.0 | 10.0 | 3.0 | Tverskoe | 1.060577 | 0.781217 | 9.374609 | 0.276256 | 0.362681 | 1.036452 | 13.100989 | 1.238732 | 1.203215 | 1.874868 | 3.269284 | 116742 | 0.065444 | 0.000078 | 1874.0 | 6772.0 | 1046.0 | 25000000 | 2015-06-30 |
| 30468 | 45 | NaN | 10.0 | 1.0 | 20.0 | NaN | 1.0 | 1.0 | 1.0 | Poselenie Vnukovskoe | 2.152792 | 1.722233 | 20.666800 | 0.897889 | 1.234235 | 4.566595 | 7.123215 | 17.148737 | 19.868997 | 21.038561 | 21.905792 | 17790 | 0.496315 | 0.007122 | NaN | NaN | NaN | 6970959 | 2015-06-30 |
| 30469 | 64 | 32.0 | 5.0 | 1.0 | 15.0 | 2003.0 | 2.0 | 11.0 | 2.0 | Obruchevskoe | 3.377814 | 2.047312 | 24.567748 | 0.203020 | 0.130667 | 1.772506 | 2.327138 | 8.940313 | 11.752036 | 12.872535 | 13.622569 | 83844 | 0.167526 | 0.093443 | 2372.0 | 6083.0 | 3300.0 | 13500000 | 2015-06-30 |
| 30470 | 43 | 28.0 | 1.0 | 1.0 | 9.0 | 1968.0 | 2.0 | 6.0 | 2.0 | Novogireevo | 0.584636 | 0.454650 | 5.455795 | 0.093619 | 0.378950 | 0.848766 | 1.920884 | 6.809408 | 9.675169 | 10.228634 | 11.812614 | 72131 | 0.063755 | 0.038693 | 2215.0 | 5824.0 | 1015.0 | 5600000 | 2015-06-30 |
30471 rows × 29 columns
Удаляем строки с аномальным годом постройки¶
# Keep only rows whose build_year lies within 1780..2015, both ends included.
# NOTE: a missing build_year fails the range check, so rows without a build
# year are dropped here together with the out-of-range ones.
build_year = train_df['build_year']
train_df = train_df[build_year.between(1780, 2015)]
train_df
| full_sq | life_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | metro_min_avto | metro_km_avto | metro_min_walk | kindergarten_km | school_km | park_km | mkad_km | ttk_km | sadovoe_km | bulvar_ring_km | kremlin_km | full_all | green_zone_part | indust_part | preschool_quota | school_quota | hospital_beds_raion | price_doc | timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8056 | 11 | 11.0 | 2.0 | 2.0 | 5.0 | 1907.0 | 1.0 | 12.0 | 3.0 | Hamovniki | 1.798776 | 1.291876 | 6.990281 | 0.377428 | 0.185809 | 0.985279 | 12.114726 | 2.301037 | 0.189294 | 1.310001 | 2.109561 | 75377 | 0.048791 | 0.000000 | 165.0 | 9337.0 | 4702.0 | 2750000 | 2013-05-21 |
| 8135 | 53 | 30.0 | 10.0 | 1.0 | 16.0 | 1980.0 | 2.0 | 8.0 | 3.0 | Lianozovo | 1.959499 | 1.503698 | 18.420277 | 0.408673 | 0.364994 | 0.875814 | 2.169200 | 11.018216 | 13.270117 | 13.854330 | 15.345902 | 68630 | 0.258663 | 0.101872 | 2703.0 | 7236.0 | NaN | 9000000 | 2013-05-25 |
| 8153 | 77 | 41.0 | 2.0 | 6.0 | 17.0 | 2014.0 | 3.0 | 12.0 | 1.0 | Poselenie Voskresenskoe | 3.121542 | 2.436882 | 29.242588 | 0.745286 | 0.936324 | 1.773759 | 7.371716 | 20.624073 | 23.753388 | 25.032110 | 25.735256 | 9553 | 0.262459 | 0.017647 | NaN | NaN | NaN | 7011550 | 2013-05-27 |
| 8154 | 45 | 27.0 | 6.0 | 1.0 | 9.0 | 1970.0 | 2.0 | 6.0 | 3.0 | Severnoe Butovo | 1.817706 | 0.827413 | 9.928954 | 0.495571 | 0.183503 | 2.590344 | 1.704870 | 15.482046 | 18.682566 | 20.077081 | 20.728839 | 78616 | 0.579645 | 0.000000 | 3617.0 | 7653.0 | 30.0 | 7100000 | 2013-05-27 |
| 8175 | 38 | 20.0 | 15.0 | 1.0 | 16.0 | 1982.0 | 1.0 | 8.0 | NaN | Filevskij Park | 6.065337 | 4.400699 | 52.407275 | 0.668816 | 0.782119 | 1.755323 | 6.856666 | 3.520763 | 6.651871 | 7.519704 | 8.569880 | 112804 | 0.343754 | 0.238617 | 1522.0 | 4904.0 | 705.0 | 6450000 | 2013-05-28 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30463 | 56 | 29.0 | 13.0 | 1.0 | 14.0 | 2001.0 | 2.0 | 11.0 | 3.0 | Severnoe Tushino | 2.622565 | 1.580238 | 8.510351 | 0.225720 | 0.193474 | 1.876243 | 1.486707 | 11.896884 | 14.587359 | 15.339712 | 16.626186 | 53786 | 0.374068 | 0.000000 | 4116.0 | 9891.0 | 1145.0 | 12000000 | 2015-06-30 |
| 30466 | 44 | 27.0 | 7.0 | 1.0 | 9.0 | 1975.0 | 2.0 | 6.0 | 3.0 | Otradnoe | 1.384021 | 0.659002 | 8.158093 | 0.132645 | 0.349899 | 1.972527 | 3.762408 | 8.361875 | 10.543724 | 11.118577 | 12.599292 | 61396 | 0.096270 | 0.300323 | 5088.0 | 12721.0 | NaN | 7400000 | 2015-06-30 |
| 30467 | 86 | 59.0 | 3.0 | 2.0 | 9.0 | 1935.0 | 4.0 | 10.0 | 3.0 | Tverskoe | 1.060577 | 0.781217 | 9.374609 | 0.276256 | 0.362681 | 1.036452 | 13.100989 | 1.238732 | 1.203215 | 1.874868 | 3.269284 | 116742 | 0.065444 | 0.000078 | 1874.0 | 6772.0 | 1046.0 | 25000000 | 2015-06-30 |
| 30469 | 64 | 32.0 | 5.0 | 1.0 | 15.0 | 2003.0 | 2.0 | 11.0 | 2.0 | Obruchevskoe | 3.377814 | 2.047312 | 24.567748 | 0.203020 | 0.130667 | 1.772506 | 2.327138 | 8.940313 | 11.752036 | 12.872535 | 13.622569 | 83844 | 0.167526 | 0.093443 | 2372.0 | 6083.0 | 3300.0 | 13500000 | 2015-06-30 |
| 30470 | 43 | 28.0 | 1.0 | 1.0 | 9.0 | 1968.0 | 2.0 | 6.0 | 2.0 | Novogireevo | 0.584636 | 0.454650 | 5.455795 | 0.093619 | 0.378950 | 0.848766 | 1.920884 | 6.809408 | 9.675169 | 10.228634 | 11.812614 | 72131 | 0.063755 | 0.038693 | 2215.0 | 5824.0 | 1015.0 | 5600000 | 2015-06-30 |
15430 rows × 29 columns
Удаляем строки, в которых жилая площадь или площадь кухни превышает общую площадь¶
# Keep only rows where both the living area and the kitchen area fit inside
# the total area. Comparisons against NaN evaluate to False, so rows with a
# missing life_sq or kitch_sq are discarded here as well.
train_df = train_df.loc[
    lambda df: df['life_sq'].le(df['full_sq']) & df['kitch_sq'].le(df['full_sq'])
]
train_df
| full_sq | life_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | metro_min_avto | metro_km_avto | metro_min_walk | kindergarten_km | school_km | park_km | mkad_km | ttk_km | sadovoe_km | bulvar_ring_km | kremlin_km | full_all | green_zone_part | indust_part | preschool_quota | school_quota | hospital_beds_raion | price_doc | timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 30.0 | 10.0 | 1.0 | 16.0 | 1980.0 | 2.0 | 8.0 | 3.0 | Lianozovo | 1.959499 | 1.503698 | 18.420277 | 0.408673 | 0.364994 | 0.875814 | 2.169200 | 11.018216 | 13.270117 | 13.854330 | 15.345902 | 68630 | 0.258663 | 0.101872 | 2703.0 | 7236.0 | NaN | 9000000 | 2013-05-25 |
| 8153 | 77 | 41.0 | 2.0 | 6.0 | 17.0 | 2014.0 | 3.0 | 12.0 | 1.0 | Poselenie Voskresenskoe | 3.121542 | 2.436882 | 29.242588 | 0.745286 | 0.936324 | 1.773759 | 7.371716 | 20.624073 | 23.753388 | 25.032110 | 25.735256 | 9553 | 0.262459 | 0.017647 | NaN | NaN | NaN | 7011550 | 2013-05-27 |
| 8154 | 45 | 27.0 | 6.0 | 1.0 | 9.0 | 1970.0 | 2.0 | 6.0 | 3.0 | Severnoe Butovo | 1.817706 | 0.827413 | 9.928954 | 0.495571 | 0.183503 | 2.590344 | 1.704870 | 15.482046 | 18.682566 | 20.077081 | 20.728839 | 78616 | 0.579645 | 0.000000 | 3617.0 | 7653.0 | 30.0 | 7100000 | 2013-05-27 |
| 8175 | 38 | 20.0 | 15.0 | 1.0 | 16.0 | 1982.0 | 1.0 | 8.0 | NaN | Filevskij Park | 6.065337 | 4.400699 | 52.407275 | 0.668816 | 0.782119 | 1.755323 | 6.856666 | 3.520763 | 6.651871 | 7.519704 | 8.569880 | 112804 | 0.343754 | 0.238617 | 1522.0 | 4904.0 | 705.0 | 6450000 | 2013-05-28 |
| 8255 | 74 | 46.0 | 12.0 | 1.0 | 24.0 | 2004.0 | 3.0 | 9.0 | 3.0 | Juzhnoe Medvedkovo | 3.509942 | 1.887630 | 22.380717 | 0.128857 | 0.140458 | 0.593471 | 2.349730 | 9.120782 | 11.418023 | 12.095526 | 13.529297 | 27992 | 0.157332 | 0.232205 | 2200.0 | 6476.0 | 80.0 | 12100000 | 2013-05-30 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30463 | 56 | 29.0 | 13.0 | 1.0 | 14.0 | 2001.0 | 2.0 | 11.0 | 3.0 | Severnoe Tushino | 2.622565 | 1.580238 | 8.510351 | 0.225720 | 0.193474 | 1.876243 | 1.486707 | 11.896884 | 14.587359 | 15.339712 | 16.626186 | 53786 | 0.374068 | 0.000000 | 4116.0 | 9891.0 | 1145.0 | 12000000 | 2015-06-30 |
| 30466 | 44 | 27.0 | 7.0 | 1.0 | 9.0 | 1975.0 | 2.0 | 6.0 | 3.0 | Otradnoe | 1.384021 | 0.659002 | 8.158093 | 0.132645 | 0.349899 | 1.972527 | 3.762408 | 8.361875 | 10.543724 | 11.118577 | 12.599292 | 61396 | 0.096270 | 0.300323 | 5088.0 | 12721.0 | NaN | 7400000 | 2015-06-30 |
| 30467 | 86 | 59.0 | 3.0 | 2.0 | 9.0 | 1935.0 | 4.0 | 10.0 | 3.0 | Tverskoe | 1.060577 | 0.781217 | 9.374609 | 0.276256 | 0.362681 | 1.036452 | 13.100989 | 1.238732 | 1.203215 | 1.874868 | 3.269284 | 116742 | 0.065444 | 0.000078 | 1874.0 | 6772.0 | 1046.0 | 25000000 | 2015-06-30 |
| 30469 | 64 | 32.0 | 5.0 | 1.0 | 15.0 | 2003.0 | 2.0 | 11.0 | 2.0 | Obruchevskoe | 3.377814 | 2.047312 | 24.567748 | 0.203020 | 0.130667 | 1.772506 | 2.327138 | 8.940313 | 11.752036 | 12.872535 | 13.622569 | 83844 | 0.167526 | 0.093443 | 2372.0 | 6083.0 | 3300.0 | 13500000 | 2015-06-30 |
| 30470 | 43 | 28.0 | 1.0 | 1.0 | 9.0 | 1968.0 | 2.0 | 6.0 | 2.0 | Novogireevo | 0.584636 | 0.454650 | 5.455795 | 0.093619 | 0.378950 | 0.848766 | 1.920884 | 6.809408 | 9.675169 | 10.228634 | 11.812614 | 72131 | 0.063755 | 0.038693 | 2215.0 | 5824.0 | 1015.0 | 5600000 | 2015-06-30 |
14931 rows × 29 columns
Удаляем дублирующиеся строки¶
# Keep the first occurrence of every fully identical row and discard repeats.
train_df = train_df[~train_df.duplicated(keep='first')]
train_df
| full_sq | life_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | metro_min_avto | metro_km_avto | metro_min_walk | kindergarten_km | school_km | park_km | mkad_km | ttk_km | sadovoe_km | bulvar_ring_km | kremlin_km | full_all | green_zone_part | indust_part | preschool_quota | school_quota | hospital_beds_raion | price_doc | timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 30.0 | 10.0 | 1.0 | 16.0 | 1980.0 | 2.0 | 8.0 | 3.0 | Lianozovo | 1.959499 | 1.503698 | 18.420277 | 0.408673 | 0.364994 | 0.875814 | 2.169200 | 11.018216 | 13.270117 | 13.854330 | 15.345902 | 68630 | 0.258663 | 0.101872 | 2703.0 | 7236.0 | NaN | 9000000 | 2013-05-25 |
| 8153 | 77 | 41.0 | 2.0 | 6.0 | 17.0 | 2014.0 | 3.0 | 12.0 | 1.0 | Poselenie Voskresenskoe | 3.121542 | 2.436882 | 29.242588 | 0.745286 | 0.936324 | 1.773759 | 7.371716 | 20.624073 | 23.753388 | 25.032110 | 25.735256 | 9553 | 0.262459 | 0.017647 | NaN | NaN | NaN | 7011550 | 2013-05-27 |
| 8154 | 45 | 27.0 | 6.0 | 1.0 | 9.0 | 1970.0 | 2.0 | 6.0 | 3.0 | Severnoe Butovo | 1.817706 | 0.827413 | 9.928954 | 0.495571 | 0.183503 | 2.590344 | 1.704870 | 15.482046 | 18.682566 | 20.077081 | 20.728839 | 78616 | 0.579645 | 0.000000 | 3617.0 | 7653.0 | 30.0 | 7100000 | 2013-05-27 |
| 8175 | 38 | 20.0 | 15.0 | 1.0 | 16.0 | 1982.0 | 1.0 | 8.0 | NaN | Filevskij Park | 6.065337 | 4.400699 | 52.407275 | 0.668816 | 0.782119 | 1.755323 | 6.856666 | 3.520763 | 6.651871 | 7.519704 | 8.569880 | 112804 | 0.343754 | 0.238617 | 1522.0 | 4904.0 | 705.0 | 6450000 | 2013-05-28 |
| 8255 | 74 | 46.0 | 12.0 | 1.0 | 24.0 | 2004.0 | 3.0 | 9.0 | 3.0 | Juzhnoe Medvedkovo | 3.509942 | 1.887630 | 22.380717 | 0.128857 | 0.140458 | 0.593471 | 2.349730 | 9.120782 | 11.418023 | 12.095526 | 13.529297 | 27992 | 0.157332 | 0.232205 | 2200.0 | 6476.0 | 80.0 | 12100000 | 2013-05-30 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30463 | 56 | 29.0 | 13.0 | 1.0 | 14.0 | 2001.0 | 2.0 | 11.0 | 3.0 | Severnoe Tushino | 2.622565 | 1.580238 | 8.510351 | 0.225720 | 0.193474 | 1.876243 | 1.486707 | 11.896884 | 14.587359 | 15.339712 | 16.626186 | 53786 | 0.374068 | 0.000000 | 4116.0 | 9891.0 | 1145.0 | 12000000 | 2015-06-30 |
| 30466 | 44 | 27.0 | 7.0 | 1.0 | 9.0 | 1975.0 | 2.0 | 6.0 | 3.0 | Otradnoe | 1.384021 | 0.659002 | 8.158093 | 0.132645 | 0.349899 | 1.972527 | 3.762408 | 8.361875 | 10.543724 | 11.118577 | 12.599292 | 61396 | 0.096270 | 0.300323 | 5088.0 | 12721.0 | NaN | 7400000 | 2015-06-30 |
| 30467 | 86 | 59.0 | 3.0 | 2.0 | 9.0 | 1935.0 | 4.0 | 10.0 | 3.0 | Tverskoe | 1.060577 | 0.781217 | 9.374609 | 0.276256 | 0.362681 | 1.036452 | 13.100989 | 1.238732 | 1.203215 | 1.874868 | 3.269284 | 116742 | 0.065444 | 0.000078 | 1874.0 | 6772.0 | 1046.0 | 25000000 | 2015-06-30 |
| 30469 | 64 | 32.0 | 5.0 | 1.0 | 15.0 | 2003.0 | 2.0 | 11.0 | 2.0 | Obruchevskoe | 3.377814 | 2.047312 | 24.567748 | 0.203020 | 0.130667 | 1.772506 | 2.327138 | 8.940313 | 11.752036 | 12.872535 | 13.622569 | 83844 | 0.167526 | 0.093443 | 2372.0 | 6083.0 | 3300.0 | 13500000 | 2015-06-30 |
| 30470 | 43 | 28.0 | 1.0 | 1.0 | 9.0 | 1968.0 | 2.0 | 6.0 | 2.0 | Novogireevo | 0.584636 | 0.454650 | 5.455795 | 0.093619 | 0.378950 | 0.848766 | 1.920884 | 6.809408 | 9.675169 | 10.228634 | 11.812614 | 72131 | 0.063755 | 0.038693 | 2215.0 | 5824.0 | 1015.0 | 5600000 | 2015-06-30 |
14930 rows × 29 columns
Работаем с пропусками в данных¶
# Print column dtypes and non-null counts to see which columns have gaps.
train_df.info()
<class 'pandas.core.frame.DataFrame'> Index: 14930 entries, 8135 to 30470 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 full_sq 14930 non-null int64 1 life_sq 14930 non-null float64 2 floor 14930 non-null float64 3 material 14930 non-null float64 4 max_floor 14930 non-null float64 5 build_year 14930 non-null float64 6 num_room 14930 non-null float64 7 kitch_sq 14930 non-null float64 8 state 13729 non-null float64 9 sub_area 14930 non-null object 10 metro_min_avto 14930 non-null float64 11 metro_km_avto 14930 non-null float64 12 metro_min_walk 14914 non-null float64 13 kindergarten_km 14930 non-null float64 14 school_km 14930 non-null float64 15 park_km 14930 non-null float64 16 mkad_km 14930 non-null float64 17 ttk_km 14930 non-null float64 18 sadovoe_km 14930 non-null float64 19 bulvar_ring_km 14930 non-null float64 20 kremlin_km 14930 non-null float64 21 full_all 14930 non-null int64 22 green_zone_part 14930 non-null float64 23 indust_part 14930 non-null float64 24 preschool_quota 13759 non-null float64 25 school_quota 13762 non-null float64 26 hospital_beds_raion 8873 non-null float64 27 price_doc 14930 non-null int64 28 timestamp 14930 non-null object dtypes: float64(24), int64(3), object(2) memory usage: 3.4+ MB
Во всех столбцах, кроме hospital_beds_raion, доля пропущенных значений невелика. В столбце hospital_beds_raion пропущено около 40 % значений (заполнено 8873 из 14930), и он менее важен, чем остальные признаки, поэтому удаляем этот столбец целиком, а в оставшихся столбцах удаляем строки с пропусками.
# hospital_beds_raion is missing in a large share of rows, so the whole column
# is removed; after that, every row that still contains a NaN is discarded.
train_df = train_df.drop(columns=['hospital_beds_raion']).dropna(axis=0, how='any')
train_df
| full_sq | life_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | metro_min_avto | metro_km_avto | metro_min_walk | kindergarten_km | school_km | park_km | mkad_km | ttk_km | sadovoe_km | bulvar_ring_km | kremlin_km | full_all | green_zone_part | indust_part | preschool_quota | school_quota | price_doc | timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 30.0 | 10.0 | 1.0 | 16.0 | 1980.0 | 2.0 | 8.0 | 3.0 | Lianozovo | 1.959499 | 1.503698 | 18.420277 | 0.408673 | 0.364994 | 0.875814 | 2.169200 | 11.018216 | 13.270117 | 13.854330 | 15.345902 | 68630 | 0.258663 | 0.101872 | 2703.0 | 7236.0 | 9000000 | 2013-05-25 |
| 8154 | 45 | 27.0 | 6.0 | 1.0 | 9.0 | 1970.0 | 2.0 | 6.0 | 3.0 | Severnoe Butovo | 1.817706 | 0.827413 | 9.928954 | 0.495571 | 0.183503 | 2.590344 | 1.704870 | 15.482046 | 18.682566 | 20.077081 | 20.728839 | 78616 | 0.579645 | 0.000000 | 3617.0 | 7653.0 | 7100000 | 2013-05-27 |
| 8255 | 74 | 46.0 | 12.0 | 1.0 | 24.0 | 2004.0 | 3.0 | 9.0 | 3.0 | Juzhnoe Medvedkovo | 3.509942 | 1.887630 | 22.380717 | 0.128857 | 0.140458 | 0.593471 | 2.349730 | 9.120782 | 11.418023 | 12.095526 | 13.529297 | 27992 | 0.157332 | 0.232205 | 2200.0 | 6476.0 | 12100000 | 2013-05-30 |
| 8282 | 51 | 30.0 | 7.0 | 1.0 | 17.0 | 2003.0 | 2.0 | 9.0 | 3.0 | Solncevo | 2.861107 | 1.682108 | 20.896099 | 0.387735 | 0.827697 | 1.766185 | 2.422245 | 12.755092 | 15.453518 | 16.623270 | 17.455159 | 125111 | 0.083810 | 0.371149 | 3953.0 | 9700.0 | 7700000 | 2013-06-01 |
| 8287 | 77 | 50.0 | 3.0 | 2.0 | 5.0 | 1957.0 | 3.0 | 8.0 | 2.0 | Ajeroport | 2.003856 | 1.501540 | 10.135706 | 0.128005 | 0.508435 | 0.741615 | 11.246993 | 0.449498 | 3.145335 | 3.872968 | 5.200064 | 1100773 | 0.109947 | 0.050272 | 2058.0 | 4975.0 | 11700000 | 2013-06-03 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30463 | 56 | 29.0 | 13.0 | 1.0 | 14.0 | 2001.0 | 2.0 | 11.0 | 3.0 | Severnoe Tushino | 2.622565 | 1.580238 | 8.510351 | 0.225720 | 0.193474 | 1.876243 | 1.486707 | 11.896884 | 14.587359 | 15.339712 | 16.626186 | 53786 | 0.374068 | 0.000000 | 4116.0 | 9891.0 | 12000000 | 2015-06-30 |
| 30466 | 44 | 27.0 | 7.0 | 1.0 | 9.0 | 1975.0 | 2.0 | 6.0 | 3.0 | Otradnoe | 1.384021 | 0.659002 | 8.158093 | 0.132645 | 0.349899 | 1.972527 | 3.762408 | 8.361875 | 10.543724 | 11.118577 | 12.599292 | 61396 | 0.096270 | 0.300323 | 5088.0 | 12721.0 | 7400000 | 2015-06-30 |
| 30467 | 86 | 59.0 | 3.0 | 2.0 | 9.0 | 1935.0 | 4.0 | 10.0 | 3.0 | Tverskoe | 1.060577 | 0.781217 | 9.374609 | 0.276256 | 0.362681 | 1.036452 | 13.100989 | 1.238732 | 1.203215 | 1.874868 | 3.269284 | 116742 | 0.065444 | 0.000078 | 1874.0 | 6772.0 | 25000000 | 2015-06-30 |
| 30469 | 64 | 32.0 | 5.0 | 1.0 | 15.0 | 2003.0 | 2.0 | 11.0 | 2.0 | Obruchevskoe | 3.377814 | 2.047312 | 24.567748 | 0.203020 | 0.130667 | 1.772506 | 2.327138 | 8.940313 | 11.752036 | 12.872535 | 13.622569 | 83844 | 0.167526 | 0.093443 | 2372.0 | 6083.0 | 13500000 | 2015-06-30 |
| 30470 | 43 | 28.0 | 1.0 | 1.0 | 9.0 | 1968.0 | 2.0 | 6.0 | 2.0 | Novogireevo | 0.584636 | 0.454650 | 5.455795 | 0.093619 | 0.378950 | 0.848766 | 1.920884 | 6.809408 | 9.675169 | 10.228634 | 11.812614 | 72131 | 0.063755 | 0.038693 | 2215.0 | 5824.0 | 5600000 | 2015-06-30 |
12662 rows × 28 columns
# Per-column count of remaining missing values (isnull is an alias of isna).
train_df.isnull().sum()
full_sq 0 life_sq 0 floor 0 material 0 max_floor 0 build_year 0 num_room 0 kitch_sq 0 state 0 sub_area 0 metro_min_avto 0 metro_km_avto 0 metro_min_walk 0 kindergarten_km 0 school_km 0 park_km 0 mkad_km 0 ttk_km 0 sadovoe_km 0 bulvar_ring_km 0 kremlin_km 0 full_all 0 green_zone_part 0 indust_part 0 preschool_quota 0 school_quota 0 price_doc 0 timestamp 0 dtype: int64
Переводим дату и год постройки в число прошедших дней и лет соответственно¶
# Single fixed reference point for both "elapsed time" features. Reading the
# wall clock here would make the feature values change on every run and would
# disagree with the fixed year used for the building age.
REFERENCE_DATE = pd.Timestamp('2024-03-31')

# Helper column holding the reference date; a later cell drops it again.
train_df['Date'] = REFERENCE_DATE.date()
# Days elapsed between the value in "timestamp" and the reference date.
train_df["timestamp"] = (REFERENCE_DATE - pd.to_datetime(train_df["timestamp"])).dt.days
# Building age in years, measured against the same reference year.
train_df["build_year"] = REFERENCE_DATE.year - train_df["build_year"]
train_df
| full_sq | life_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | metro_min_avto | metro_km_avto | metro_min_walk | kindergarten_km | school_km | park_km | mkad_km | ttk_km | sadovoe_km | bulvar_ring_km | kremlin_km | full_all | green_zone_part | indust_part | preschool_quota | school_quota | price_doc | timestamp | Date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 30.0 | 10.0 | 1.0 | 16.0 | 44.0 | 2.0 | 8.0 | 3.0 | Lianozovo | 1.959499 | 1.503698 | 18.420277 | 0.408673 | 0.364994 | 0.875814 | 2.169200 | 11.018216 | 13.270117 | 13.854330 | 15.345902 | 68630 | 0.258663 | 0.101872 | 2703.0 | 7236.0 | 9000000 | 3963 | 2024-03-31 |
| 8154 | 45 | 27.0 | 6.0 | 1.0 | 9.0 | 54.0 | 2.0 | 6.0 | 3.0 | Severnoe Butovo | 1.817706 | 0.827413 | 9.928954 | 0.495571 | 0.183503 | 2.590344 | 1.704870 | 15.482046 | 18.682566 | 20.077081 | 20.728839 | 78616 | 0.579645 | 0.000000 | 3617.0 | 7653.0 | 7100000 | 3961 | 2024-03-31 |
| 8255 | 74 | 46.0 | 12.0 | 1.0 | 24.0 | 20.0 | 3.0 | 9.0 | 3.0 | Juzhnoe Medvedkovo | 3.509942 | 1.887630 | 22.380717 | 0.128857 | 0.140458 | 0.593471 | 2.349730 | 9.120782 | 11.418023 | 12.095526 | 13.529297 | 27992 | 0.157332 | 0.232205 | 2200.0 | 6476.0 | 12100000 | 3958 | 2024-03-31 |
| 8282 | 51 | 30.0 | 7.0 | 1.0 | 17.0 | 21.0 | 2.0 | 9.0 | 3.0 | Solncevo | 2.861107 | 1.682108 | 20.896099 | 0.387735 | 0.827697 | 1.766185 | 2.422245 | 12.755092 | 15.453518 | 16.623270 | 17.455159 | 125111 | 0.083810 | 0.371149 | 3953.0 | 9700.0 | 7700000 | 3956 | 2024-03-31 |
| 8287 | 77 | 50.0 | 3.0 | 2.0 | 5.0 | 67.0 | 3.0 | 8.0 | 2.0 | Ajeroport | 2.003856 | 1.501540 | 10.135706 | 0.128005 | 0.508435 | 0.741615 | 11.246993 | 0.449498 | 3.145335 | 3.872968 | 5.200064 | 1100773 | 0.109947 | 0.050272 | 2058.0 | 4975.0 | 11700000 | 3954 | 2024-03-31 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30463 | 56 | 29.0 | 13.0 | 1.0 | 14.0 | 23.0 | 2.0 | 11.0 | 3.0 | Severnoe Tushino | 2.622565 | 1.580238 | 8.510351 | 0.225720 | 0.193474 | 1.876243 | 1.486707 | 11.896884 | 14.587359 | 15.339712 | 16.626186 | 53786 | 0.374068 | 0.000000 | 4116.0 | 9891.0 | 12000000 | 3197 | 2024-03-31 |
| 30466 | 44 | 27.0 | 7.0 | 1.0 | 9.0 | 49.0 | 2.0 | 6.0 | 3.0 | Otradnoe | 1.384021 | 0.659002 | 8.158093 | 0.132645 | 0.349899 | 1.972527 | 3.762408 | 8.361875 | 10.543724 | 11.118577 | 12.599292 | 61396 | 0.096270 | 0.300323 | 5088.0 | 12721.0 | 7400000 | 3197 | 2024-03-31 |
| 30467 | 86 | 59.0 | 3.0 | 2.0 | 9.0 | 89.0 | 4.0 | 10.0 | 3.0 | Tverskoe | 1.060577 | 0.781217 | 9.374609 | 0.276256 | 0.362681 | 1.036452 | 13.100989 | 1.238732 | 1.203215 | 1.874868 | 3.269284 | 116742 | 0.065444 | 0.000078 | 1874.0 | 6772.0 | 25000000 | 3197 | 2024-03-31 |
| 30469 | 64 | 32.0 | 5.0 | 1.0 | 15.0 | 21.0 | 2.0 | 11.0 | 2.0 | Obruchevskoe | 3.377814 | 2.047312 | 24.567748 | 0.203020 | 0.130667 | 1.772506 | 2.327138 | 8.940313 | 11.752036 | 12.872535 | 13.622569 | 83844 | 0.167526 | 0.093443 | 2372.0 | 6083.0 | 13500000 | 3197 | 2024-03-31 |
| 30470 | 43 | 28.0 | 1.0 | 1.0 | 9.0 | 56.0 | 2.0 | 6.0 | 2.0 | Novogireevo | 0.584636 | 0.454650 | 5.455795 | 0.093619 | 0.378950 | 0.848766 | 1.920884 | 6.809408 | 9.675169 | 10.228634 | 11.812614 | 72131 | 0.063755 | 0.038693 | 2215.0 | 5824.0 | 5600000 | 3197 | 2024-03-31 |
12662 rows × 29 columns
# The temporary 'Date' helper column has served its purpose; remove it.
train_df = train_df.drop('Date', axis='columns')
train_df
| full_sq | life_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | metro_min_avto | metro_km_avto | metro_min_walk | kindergarten_km | school_km | park_km | mkad_km | ttk_km | sadovoe_km | bulvar_ring_km | kremlin_km | full_all | green_zone_part | indust_part | preschool_quota | school_quota | price_doc | timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 30.0 | 10.0 | 1.0 | 16.0 | 44.0 | 2.0 | 8.0 | 3.0 | Lianozovo | 1.959499 | 1.503698 | 18.420277 | 0.408673 | 0.364994 | 0.875814 | 2.169200 | 11.018216 | 13.270117 | 13.854330 | 15.345902 | 68630 | 0.258663 | 0.101872 | 2703.0 | 7236.0 | 9000000 | 3963 |
| 8154 | 45 | 27.0 | 6.0 | 1.0 | 9.0 | 54.0 | 2.0 | 6.0 | 3.0 | Severnoe Butovo | 1.817706 | 0.827413 | 9.928954 | 0.495571 | 0.183503 | 2.590344 | 1.704870 | 15.482046 | 18.682566 | 20.077081 | 20.728839 | 78616 | 0.579645 | 0.000000 | 3617.0 | 7653.0 | 7100000 | 3961 |
| 8255 | 74 | 46.0 | 12.0 | 1.0 | 24.0 | 20.0 | 3.0 | 9.0 | 3.0 | Juzhnoe Medvedkovo | 3.509942 | 1.887630 | 22.380717 | 0.128857 | 0.140458 | 0.593471 | 2.349730 | 9.120782 | 11.418023 | 12.095526 | 13.529297 | 27992 | 0.157332 | 0.232205 | 2200.0 | 6476.0 | 12100000 | 3958 |
| 8282 | 51 | 30.0 | 7.0 | 1.0 | 17.0 | 21.0 | 2.0 | 9.0 | 3.0 | Solncevo | 2.861107 | 1.682108 | 20.896099 | 0.387735 | 0.827697 | 1.766185 | 2.422245 | 12.755092 | 15.453518 | 16.623270 | 17.455159 | 125111 | 0.083810 | 0.371149 | 3953.0 | 9700.0 | 7700000 | 3956 |
| 8287 | 77 | 50.0 | 3.0 | 2.0 | 5.0 | 67.0 | 3.0 | 8.0 | 2.0 | Ajeroport | 2.003856 | 1.501540 | 10.135706 | 0.128005 | 0.508435 | 0.741615 | 11.246993 | 0.449498 | 3.145335 | 3.872968 | 5.200064 | 1100773 | 0.109947 | 0.050272 | 2058.0 | 4975.0 | 11700000 | 3954 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30463 | 56 | 29.0 | 13.0 | 1.0 | 14.0 | 23.0 | 2.0 | 11.0 | 3.0 | Severnoe Tushino | 2.622565 | 1.580238 | 8.510351 | 0.225720 | 0.193474 | 1.876243 | 1.486707 | 11.896884 | 14.587359 | 15.339712 | 16.626186 | 53786 | 0.374068 | 0.000000 | 4116.0 | 9891.0 | 12000000 | 3197 |
| 30466 | 44 | 27.0 | 7.0 | 1.0 | 9.0 | 49.0 | 2.0 | 6.0 | 3.0 | Otradnoe | 1.384021 | 0.659002 | 8.158093 | 0.132645 | 0.349899 | 1.972527 | 3.762408 | 8.361875 | 10.543724 | 11.118577 | 12.599292 | 61396 | 0.096270 | 0.300323 | 5088.0 | 12721.0 | 7400000 | 3197 |
| 30467 | 86 | 59.0 | 3.0 | 2.0 | 9.0 | 89.0 | 4.0 | 10.0 | 3.0 | Tverskoe | 1.060577 | 0.781217 | 9.374609 | 0.276256 | 0.362681 | 1.036452 | 13.100989 | 1.238732 | 1.203215 | 1.874868 | 3.269284 | 116742 | 0.065444 | 0.000078 | 1874.0 | 6772.0 | 25000000 | 3197 |
| 30469 | 64 | 32.0 | 5.0 | 1.0 | 15.0 | 21.0 | 2.0 | 11.0 | 2.0 | Obruchevskoe | 3.377814 | 2.047312 | 24.567748 | 0.203020 | 0.130667 | 1.772506 | 2.327138 | 8.940313 | 11.752036 | 12.872535 | 13.622569 | 83844 | 0.167526 | 0.093443 | 2372.0 | 6083.0 | 13500000 | 3197 |
| 30470 | 43 | 28.0 | 1.0 | 1.0 | 9.0 | 56.0 | 2.0 | 6.0 | 2.0 | Novogireevo | 0.584636 | 0.454650 | 5.455795 | 0.093619 | 0.378950 | 0.848766 | 1.920884 | 6.809408 | 9.675169 | 10.228634 | 11.812614 | 72131 | 0.063755 | 0.038693 | 2215.0 | 5824.0 | 5600000 | 3197 |
12662 rows × 28 columns
Построим тепловую карту корреляции между столбцами¶
# Replace the string values of sub_area with integer codes so the column can
# take part in the correlation matrix.
le = LabelEncoder()
# pop() removes the text column in place and hands it to the encoder.
label = le.fit_transform(train_df.pop('sub_area'))
# Re-adding the column places the encoded values last in the frame.
train_df['sub_area'] = label
train_df['sub_area'].value_counts()
sub_area
60 346
71 319
38 295
65 260
48 210
...
47 20
114 18
93 15
4 6
116 5
Name: count, Length: 124, dtype: int64
# Annotated heatmap of the pairwise correlations between all columns.
sns.heatmap(data=train_df.corr(), annot=True, cmap="YlGnBu")
<Axes: >
# Let pandas display up to 30 columns so the whole correlation table is shown.
pd.options.display.max_columns = 30
train_df.corr()
| full_sq | life_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | metro_min_avto | metro_km_avto | metro_min_walk | kindergarten_km | school_km | park_km | mkad_km | ttk_km | sadovoe_km | bulvar_ring_km | kremlin_km | full_all | green_zone_part | indust_part | preschool_quota | school_quota | price_doc | timestamp | sub_area | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| full_sq | 1.000000 | 0.864517 | 0.189318 | 0.035117 | 0.274122 | -0.278751 | 0.750839 | 0.377070 | -0.028483 | 0.007149 | 0.005909 | 0.007016 | 0.100200 | 0.035749 | 0.004796 | 0.077720 | 0.013391 | -0.003163 | -0.005595 | -0.009325 | -0.011848 | -0.012583 | -0.079593 | 0.004858 | 0.044580 | 0.691355 | -0.042481 | 0.008270 |
| life_sq | 0.864517 | 1.000000 | 0.128771 | 0.033228 | 0.146827 | -0.203463 | 0.693216 | 0.168017 | -0.126236 | -0.017748 | -0.016161 | -0.014376 | 0.116623 | 0.027671 | -0.008896 | 0.073724 | -0.016097 | -0.032301 | -0.036614 | -0.039980 | -0.023296 | -0.013441 | -0.056332 | -0.042248 | 0.005524 | 0.576936 | -0.020679 | 0.025625 |
| floor | 0.189318 | 0.128771 | 1.000000 | 0.033280 | 0.556100 | -0.443583 | 0.003720 | 0.173599 | -0.051380 | 0.030750 | 0.030862 | 0.029687 | 0.088577 | -0.022125 | 0.027717 | -0.029004 | 0.114478 | 0.117351 | 0.117870 | 0.112558 | -0.013001 | 0.002560 | -0.028013 | 0.083587 | 0.092995 | 0.146818 | -0.001655 | 0.008698 |
| material | 0.035117 | 0.033228 | 0.033280 | 1.000000 | 0.059418 | 0.066996 | -0.044157 | 0.076831 | -0.060944 | -0.008901 | -0.001139 | -0.002480 | 0.065271 | 0.063827 | -0.013142 | 0.110607 | -0.123056 | -0.135724 | -0.138435 | -0.135454 | 0.041964 | -0.036681 | 0.013148 | -0.173363 | -0.150087 | 0.074002 | 0.004974 | 0.001462 |
| max_floor | 0.274122 | 0.146827 | 0.556100 | 0.059418 | 1.000000 | -0.652505 | -0.005550 | 0.384772 | -0.000541 | 0.065542 | 0.064770 | 0.062739 | 0.091581 | -0.029464 | 0.053463 | -0.029869 | 0.173209 | 0.178452 | 0.179853 | 0.171547 | 0.009486 | -0.016115 | -0.033329 | 0.126666 | 0.129807 | 0.187443 | -0.007001 | 0.002527 |
| build_year | -0.278751 | -0.203463 | -0.443583 | 0.066996 | -0.652505 | 1.000000 | 0.031552 | -0.218930 | 0.138939 | -0.116226 | -0.116404 | -0.113000 | -0.178505 | 0.006054 | -0.113884 | 0.064431 | -0.313149 | -0.329221 | -0.326671 | -0.319065 | 0.002107 | 0.022828 | 0.012089 | -0.260528 | -0.260072 | -0.118232 | 0.006754 | -0.046289 |
| num_room | 0.750839 | 0.693216 | 0.003720 | -0.044157 | -0.005550 | 0.031552 | 1.000000 | 0.150424 | 0.049339 | -0.000602 | -0.004525 | -0.001984 | 0.030993 | -0.001847 | 0.001296 | 0.043895 | -0.012724 | -0.025387 | -0.026663 | -0.027465 | -0.018244 | 0.009894 | -0.058327 | -0.023724 | 0.000685 | 0.479110 | -0.040553 | 0.005968 |
| kitch_sq | 0.377070 | 0.168017 | 0.173599 | 0.076831 | 0.384772 | -0.218930 | 0.150424 | 1.000000 | 0.196511 | 0.048246 | 0.044392 | 0.043601 | -0.009848 | 0.015718 | 0.028961 | 0.014055 | 0.061710 | 0.061109 | 0.065725 | 0.063095 | 0.024245 | 0.002216 | -0.052059 | 0.091357 | 0.086052 | 0.312810 | -0.018144 | -0.040975 |
| state | -0.028483 | -0.126236 | -0.051380 | -0.060944 | -0.000541 | 0.138939 | 0.049339 | 0.196511 | 1.000000 | 0.021254 | 0.014202 | 0.013389 | -0.119484 | -0.025052 | -0.003169 | -0.023621 | 0.015438 | 0.016682 | 0.024569 | 0.022566 | -0.001948 | 0.032722 | -0.039107 | 0.071744 | 0.048169 | 0.074048 | 0.023145 | -0.072874 |
| metro_min_avto | 0.007149 | -0.017748 | 0.030750 | -0.008901 | 0.065542 | -0.116226 | -0.000602 | 0.048246 | 0.021254 | 1.000000 | 0.991986 | 0.992027 | 0.259406 | 0.006672 | 0.917306 | 0.552688 | 0.784782 | 0.758159 | 0.749082 | 0.753328 | 0.013380 | 0.007148 | 0.113768 | -0.003284 | -0.067959 | -0.118110 | 0.008246 | 0.005465 |
| metro_km_avto | 0.005909 | -0.016161 | 0.030862 | -0.001139 | 0.064770 | -0.116404 | -0.004525 | 0.044392 | 0.014202 | 0.991986 | 1.000000 | 0.995201 | 0.268769 | 0.009618 | 0.916551 | 0.559573 | 0.781466 | 0.754169 | 0.745258 | 0.749158 | 0.028728 | 0.002075 | 0.117591 | -0.004937 | -0.066037 | -0.117760 | 0.008395 | 0.008393 |
| metro_min_walk | 0.007016 | -0.014376 | 0.029687 | -0.002480 | 0.062739 | -0.113000 | -0.001984 | 0.043601 | 0.013389 | 0.992027 | 0.995201 | 1.000000 | 0.265616 | 0.003286 | 0.923493 | 0.564353 | 0.782449 | 0.754915 | 0.745348 | 0.749681 | 0.017928 | 0.005057 | 0.117971 | -0.011292 | -0.073166 | -0.117417 | 0.010236 | 0.018286 |
| kindergarten_km | 0.100200 | 0.116623 | 0.088577 | 0.065271 | 0.091581 | -0.178505 | 0.030993 | -0.009848 | -0.119484 | 0.259406 | 0.268769 | 0.265616 | 1.000000 | 0.160127 | 0.250244 | 0.204851 | 0.241566 | 0.221111 | 0.209809 | 0.206920 | -0.032634 | -0.015839 | 0.077809 | -0.134490 | -0.123451 | 0.008449 | -0.024861 | 0.083400 |
| school_km | 0.035749 | 0.027671 | -0.022125 | 0.063827 | -0.029464 | 0.006054 | -0.001847 | 0.015718 | -0.025052 | 0.006672 | 0.009618 | 0.003286 | 0.160127 | 1.000000 | -0.002631 | 0.091883 | -0.093441 | -0.100354 | -0.097436 | -0.096789 | -0.025123 | -0.028034 | 0.064330 | -0.105982 | -0.092575 | 0.028849 | -0.025070 | -0.027851 |
| park_km | 0.004796 | -0.008896 | 0.027717 | -0.013142 | 0.053463 | -0.113884 | 0.001296 | 0.028961 | -0.003169 | 0.917306 | 0.916551 | 0.923493 | 0.250244 | -0.002631 | 1.000000 | 0.596361 | 0.774143 | 0.751616 | 0.743493 | 0.749066 | 0.011996 | -0.021965 | 0.089671 | 0.001255 | -0.058306 | -0.121468 | 0.010851 | 0.042631 |
| mkad_km | 0.077720 | 0.073724 | -0.029004 | 0.110607 | -0.029869 | 0.064431 | 0.043895 | 0.014055 | -0.023621 | 0.552688 | 0.559573 | 0.564353 | 0.204851 | 0.091883 | 0.596361 | 1.000000 | 0.156230 | 0.094667 | 0.087518 | 0.080267 | -0.029235 | -0.276890 | 0.087960 | -0.256196 | -0.236373 | 0.105964 | -0.002862 | -0.011458 |
| ttk_km | 0.013391 | -0.016097 | 0.114478 | -0.123056 | 0.173209 | -0.313149 | -0.012724 | 0.061710 | 0.015438 | 0.784782 | 0.781466 | 0.782449 | 0.241566 | -0.093441 | 0.774143 | 0.156230 | 1.000000 | 0.989073 | 0.983063 | 0.983573 | -0.022213 | 0.093437 | 0.018336 | 0.328936 | 0.239421 | -0.183299 | 0.013294 | 0.022572 |
| sadovoe_km | -0.003163 | -0.032301 | 0.117351 | -0.135724 | 0.178452 | -0.329221 | -0.025387 | 0.061109 | 0.016682 | 0.758159 | 0.754169 | 0.754915 | 0.221111 | -0.100354 | 0.751616 | 0.094667 | 0.989073 | 1.000000 | 0.998048 | 0.998053 | -0.008304 | 0.101632 | 0.032191 | 0.356299 | 0.260203 | -0.207963 | 0.013045 | 0.021296 |
| bulvar_ring_km | -0.005595 | -0.036614 | 0.117870 | -0.138435 | 0.179853 | -0.326671 | -0.026663 | 0.065725 | 0.024569 | 0.749082 | 0.745258 | 0.745348 | 0.209809 | -0.097436 | 0.743493 | 0.087518 | 0.983063 | 0.998048 | 1.000000 | 0.998093 | -0.002878 | 0.104637 | 0.032559 | 0.368840 | 0.270477 | -0.206398 | 0.011876 | 0.009330 |
| kremlin_km | -0.009325 | -0.039980 | 0.112558 | -0.135454 | 0.171547 | -0.319065 | -0.027465 | 0.063095 | 0.022566 | 0.753328 | 0.749158 | 0.749681 | 0.206920 | -0.096789 | 0.749066 | 0.080267 | 0.983573 | 0.998053 | 0.998093 | 1.000000 | -0.004420 | 0.110159 | 0.039223 | 0.356667 | 0.257996 | -0.211147 | 0.012964 | 0.013025 |
| full_all | -0.011848 | -0.023296 | -0.013001 | 0.041964 | 0.009486 | 0.002107 | -0.018244 | 0.024245 | -0.001948 | 0.013380 | 0.028728 | 0.017928 | -0.032634 | -0.025123 | 0.011996 | -0.029235 | -0.022213 | -0.008304 | -0.002878 | -0.004420 | 1.000000 | 0.137935 | -0.110288 | 0.062147 | 0.083287 | -0.021756 | -0.015010 | -0.230209 |
| green_zone_part | -0.012583 | -0.013441 | 0.002560 | -0.036681 | -0.016115 | 0.022828 | 0.009894 | 0.002216 | 0.032722 | 0.007148 | 0.002075 | 0.005057 | -0.015839 | -0.028034 | -0.021965 | -0.276890 | 0.093437 | 0.101632 | 0.104637 | 0.110159 | 0.137935 | 1.000000 | -0.440444 | 0.084792 | 0.057447 | -0.033283 | 0.012012 | -0.056535 |
| indust_part | -0.079593 | -0.056332 | -0.028013 | 0.013148 | -0.033329 | 0.012089 | -0.058327 | -0.052059 | -0.039107 | 0.113768 | 0.117591 | 0.117971 | 0.077809 | 0.064330 | 0.089671 | 0.087960 | 0.018336 | 0.032191 | 0.032559 | 0.039223 | -0.110288 | -0.440444 | 1.000000 | -0.214741 | -0.245558 | -0.136257 | -0.001335 | -0.077357 |
| preschool_quota | 0.004858 | -0.042248 | 0.083587 | -0.173363 | 0.126666 | -0.260528 | -0.023724 | 0.091357 | 0.071744 | -0.003284 | -0.004937 | -0.011292 | -0.134490 | -0.105982 | 0.001255 | -0.256196 | 0.328936 | 0.356299 | 0.368840 | 0.356667 | 0.062147 | 0.084792 | -0.214741 | 1.000000 | 0.936045 | -0.088240 | 0.011001 | -0.018985 |
| school_quota | 0.044580 | 0.005524 | 0.092995 | -0.150087 | 0.129807 | -0.260072 | 0.000685 | 0.086052 | 0.048169 | -0.067959 | -0.066037 | -0.073166 | -0.123451 | -0.092575 | -0.058306 | -0.236373 | 0.239421 | 0.260203 | 0.270477 | 0.257996 | 0.083287 | 0.057447 | -0.245558 | 0.936045 | 1.000000 | -0.014813 | 0.008242 | -0.031697 |
| price_doc | 0.691355 | 0.576936 | 0.146818 | 0.074002 | 0.187443 | -0.118232 | 0.479110 | 0.312810 | 0.074048 | -0.118110 | -0.117760 | -0.117417 | 0.008449 | 0.028849 | -0.121468 | 0.105964 | -0.183299 | -0.207963 | -0.206398 | -0.211147 | -0.021756 | -0.033283 | -0.136257 | -0.088240 | -0.014813 | 1.000000 | -0.089644 | -0.002547 |
| timestamp | -0.042481 | -0.020679 | -0.001655 | 0.004974 | -0.007001 | 0.006754 | -0.040553 | -0.018144 | 0.023145 | 0.008246 | 0.008395 | 0.010236 | -0.024861 | -0.025070 | 0.010851 | -0.002862 | 0.013294 | 0.013045 | 0.011876 | 0.012964 | -0.015010 | 0.012012 | -0.001335 | 0.011001 | 0.008242 | -0.089644 | 1.000000 | 0.001438 |
| sub_area | 0.008270 | 0.025625 | 0.008698 | 0.001462 | 0.002527 | -0.046289 | 0.005968 | -0.040975 | -0.072874 | 0.005465 | 0.008393 | 0.018286 | 0.083400 | -0.027851 | 0.042631 | -0.011458 | 0.022572 | 0.021296 | 0.009330 | 0.013025 | -0.230209 | -0.056535 | -0.077357 | -0.018985 | -0.031697 | -0.002547 | 0.001438 | 1.000000 |
Вывод:
Столбцы с высокой корреляцией:
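(metro_min_avto, metro_km_avto, metro_min_walk, park_km, mkad_km, ttk_km, sadovoe_km, bulvar_ring_km, kremlin_km): взаимная корреляция признаков, кроме mkad_km, составляет 0.74–1.00; mkad_km коррелирует с остальными заметно слабее (0.08–0.60), но тоже описывает расположение объекта; оставляем kremlin_km
(kindergarten_km, school_km): корреляция невысокая (0.16), но признаки близки по смыслу; оставляем school_km
(preschool_quota, school_quota): корреляция 0.94; оставляем school_quota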
# Columns kept after leaving one representative per group of related features.
# hospital_beds_raion is intentionally absent: that column was dropped earlier
# while handling missing values, so listing it here would be a stale entry.
non_corr_columns = [
    "full_sq",
    "life_sq",
    "floor",
    "material",
    "max_floor",
    "build_year",
    "num_room",
    "kitch_sq",
    "state",
    "sub_area",
    "school_km",
    "kremlin_km",
    "full_all",
    "green_zone_part",
    "indust_part",
    "school_quota",
    "timestamp",
    "price_doc",
]
# Plain label indexing, unlike filter(items=...), raises KeyError when a listed
# column is missing, so a stale or misspelled name cannot be silently ignored.
train_df = train_df[non_corr_columns]
train_df
| full_sq | life_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | school_km | kremlin_km | full_all | green_zone_part | indust_part | school_quota | timestamp | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 30.0 | 10.0 | 1.0 | 16.0 | 44.0 | 2.0 | 8.0 | 3.0 | 55 | 0.364994 | 15.345902 | 68630 | 0.258663 | 0.101872 | 7236.0 | 3963 | 9000000 |
| 8154 | 45 | 27.0 | 6.0 | 1.0 | 9.0 | 54.0 | 2.0 | 6.0 | 3.0 | 94 | 0.183503 | 20.728839 | 78616 | 0.579645 | 0.000000 | 7653.0 | 3961 | 7100000 |
| 8255 | 74 | 46.0 | 12.0 | 1.0 | 24.0 | 20.0 | 3.0 | 9.0 | 3.0 | 39 | 0.140458 | 13.529297 | 27992 | 0.157332 | 0.232205 | 6476.0 | 3958 | 12100000 |
| 8282 | 51 | 30.0 | 7.0 | 1.0 | 17.0 | 21.0 | 2.0 | 9.0 | 3.0 | 103 | 0.827697 | 17.455159 | 125111 | 0.083810 | 0.371149 | 9700.0 | 3956 | 7700000 |
| 8287 | 77 | 50.0 | 3.0 | 2.0 | 5.0 | 67.0 | 3.0 | 8.0 | 2.0 | 0 | 0.508435 | 5.200064 | 1100773 | 0.109947 | 0.050272 | 4975.0 | 3954 | 11700000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30463 | 56 | 29.0 | 13.0 | 1.0 | 14.0 | 23.0 | 2.0 | 11.0 | 3.0 | 97 | 0.193474 | 16.626186 | 53786 | 0.374068 | 0.000000 | 9891.0 | 3197 | 12000000 |
| 30466 | 44 | 27.0 | 7.0 | 1.0 | 9.0 | 49.0 | 2.0 | 6.0 | 3.0 | 81 | 0.349899 | 12.599292 | 61396 | 0.096270 | 0.300323 | 12721.0 | 3197 | 7400000 |
| 30467 | 86 | 59.0 | 3.0 | 2.0 | 9.0 | 89.0 | 4.0 | 10.0 | 3.0 | 112 | 0.362681 | 3.269284 | 116742 | 0.065444 | 0.000078 | 6772.0 | 3197 | 25000000 |
| 30469 | 64 | 32.0 | 5.0 | 1.0 | 15.0 | 21.0 | 2.0 | 11.0 | 2.0 | 76 | 0.130667 | 13.622569 | 83844 | 0.167526 | 0.093443 | 6083.0 | 3197 | 13500000 |
| 30470 | 43 | 28.0 | 1.0 | 1.0 | 9.0 | 56.0 | 2.0 | 6.0 | 2.0 | 74 | 0.378950 | 11.812614 | 72131 | 0.063755 | 0.038693 | 5824.0 | 3197 | 5600000 |
12662 rows × 18 columns
sns.heatmap(train_df.corr(), cmap="YlGnBu", annot=True)
<Axes: >
train_df.sub_area.value_counts()
sub_area
60 346
71 319
38 295
65 260
48 210
...
47 20
114 18
93 15
4 6
116 5
Name: count, Length: 124, dtype: int64
train_df.school_quota.value_counts()
school_quota
21892.0 346
7377.0 319
24750.0 295
17063.0 260
10529.0 210
...
4091.0 20
2231.0 18
3298.0 15
5155.0 6
1924.0 5
Name: count, Length: 124, dtype: int64
train_df.full_all.value_counts()
full_all
165727 346
247469 319
102618 295
21155 260
221709 210
...
55590 20
1285626 18
122873 15
741887 6
76308 5
Name: count, Length: 124, dtype: int64
train_df.green_zone_part.value_counts()
green_zone_part
0.188713 346
0.055644 319
0.137846 295
0.194703 260
0.062172 210
...
0.011048 20
0.051549 18
0.146447 15
0.009851 6
0.081410 5
Name: count, Length: 124, dtype: int64
train_df.indust_part.value_counts()
indust_part
0.000000 1065
0.090799 346
0.243205 319
0.041116 295
0.069753 260
...
0.407276 23
0.246624 20
0.036270 18
0.197368 15
0.394094 5
Name: count, Length: 111, dtype: int64
train_df.school_km.value_counts()
school_km
0.214197 176
0.269716 100
0.249131 51
0.480735 35
0.000000 30
...
0.276716 1
0.736226 1
0.829722 1
0.268290 1
0.130667 1
Name: count, Length: 8058, dtype: int64
train_df.kremlin_km.value_counts()
kremlin_km
20.549464 176
0.072897 100
18.752843 51
19.691904 35
15.869044 28
...
15.323615 1
14.099051 1
6.281530 1
15.170973 1
13.622569 1
Name: count, Length: 8079, dtype: int64
Столбцы school_quota, full_all, green_zone_part и indust_part принимают значения в зависимости от значения столбца sub_area, поэтому их тоже можно удалить.
Столбец school_km принимает значение в зависимости от значения столбца kremlin_km, поэтому его можно удалить.
# Remove the district-level features and school_km, which the preceding
# value_counts() inspection treats as redundant with sub_area / kremlin_km.
train_df = train_df.drop(
    ['school_quota', 'full_all', 'green_zone_part', 'indust_part', 'school_km'],
    axis=1,
)
train_df
| full_sq | life_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | kremlin_km | timestamp | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 30.0 | 10.0 | 1.0 | 16.0 | 44.0 | 2.0 | 8.0 | 3.0 | 55 | 15.345902 | 3963 | 9000000 |
| 8154 | 45 | 27.0 | 6.0 | 1.0 | 9.0 | 54.0 | 2.0 | 6.0 | 3.0 | 94 | 20.728839 | 3961 | 7100000 |
| 8255 | 74 | 46.0 | 12.0 | 1.0 | 24.0 | 20.0 | 3.0 | 9.0 | 3.0 | 39 | 13.529297 | 3958 | 12100000 |
| 8282 | 51 | 30.0 | 7.0 | 1.0 | 17.0 | 21.0 | 2.0 | 9.0 | 3.0 | 103 | 17.455159 | 3956 | 7700000 |
| 8287 | 77 | 50.0 | 3.0 | 2.0 | 5.0 | 67.0 | 3.0 | 8.0 | 2.0 | 0 | 5.200064 | 3954 | 11700000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30463 | 56 | 29.0 | 13.0 | 1.0 | 14.0 | 23.0 | 2.0 | 11.0 | 3.0 | 97 | 16.626186 | 3197 | 12000000 |
| 30466 | 44 | 27.0 | 7.0 | 1.0 | 9.0 | 49.0 | 2.0 | 6.0 | 3.0 | 81 | 12.599292 | 3197 | 7400000 |
| 30467 | 86 | 59.0 | 3.0 | 2.0 | 9.0 | 89.0 | 4.0 | 10.0 | 3.0 | 112 | 3.269284 | 3197 | 25000000 |
| 30469 | 64 | 32.0 | 5.0 | 1.0 | 15.0 | 21.0 | 2.0 | 11.0 | 2.0 | 76 | 13.622569 | 3197 | 13500000 |
| 30470 | 43 | 28.0 | 1.0 | 1.0 | 9.0 | 56.0 | 2.0 | 6.0 | 2.0 | 74 | 11.812614 | 3197 | 5600000 |
12662 rows × 13 columns
sns.heatmap(train_df.corr(), cmap="YlGnBu", annot=True)
<Axes: >
train_df.corr()
| full_sq | life_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | kremlin_km | timestamp | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| full_sq | 1.000000 | 0.864517 | 0.189318 | 0.035117 | 0.274122 | -0.278751 | 0.750839 | 0.377070 | -0.028483 | 0.008270 | -0.009325 | -0.042481 | 0.691355 |
| life_sq | 0.864517 | 1.000000 | 0.128771 | 0.033228 | 0.146827 | -0.203463 | 0.693216 | 0.168017 | -0.126236 | 0.025625 | -0.039980 | -0.020679 | 0.576936 |
| floor | 0.189318 | 0.128771 | 1.000000 | 0.033280 | 0.556100 | -0.443583 | 0.003720 | 0.173599 | -0.051380 | 0.008698 | 0.112558 | -0.001655 | 0.146818 |
| material | 0.035117 | 0.033228 | 0.033280 | 1.000000 | 0.059418 | 0.066996 | -0.044157 | 0.076831 | -0.060944 | 0.001462 | -0.135454 | 0.004974 | 0.074002 |
| max_floor | 0.274122 | 0.146827 | 0.556100 | 0.059418 | 1.000000 | -0.652505 | -0.005550 | 0.384772 | -0.000541 | 0.002527 | 0.171547 | -0.007001 | 0.187443 |
| build_year | -0.278751 | -0.203463 | -0.443583 | 0.066996 | -0.652505 | 1.000000 | 0.031552 | -0.218930 | 0.138939 | -0.046289 | -0.319065 | 0.006754 | -0.118232 |
| num_room | 0.750839 | 0.693216 | 0.003720 | -0.044157 | -0.005550 | 0.031552 | 1.000000 | 0.150424 | 0.049339 | 0.005968 | -0.027465 | -0.040553 | 0.479110 |
| kitch_sq | 0.377070 | 0.168017 | 0.173599 | 0.076831 | 0.384772 | -0.218930 | 0.150424 | 1.000000 | 0.196511 | -0.040975 | 0.063095 | -0.018144 | 0.312810 |
| state | -0.028483 | -0.126236 | -0.051380 | -0.060944 | -0.000541 | 0.138939 | 0.049339 | 0.196511 | 1.000000 | -0.072874 | 0.022566 | 0.023145 | 0.074048 |
| sub_area | 0.008270 | 0.025625 | 0.008698 | 0.001462 | 0.002527 | -0.046289 | 0.005968 | -0.040975 | -0.072874 | 1.000000 | 0.013025 | 0.001438 | -0.002547 |
| kremlin_km | -0.009325 | -0.039980 | 0.112558 | -0.135454 | 0.171547 | -0.319065 | -0.027465 | 0.063095 | 0.022566 | 0.013025 | 1.000000 | 0.012964 | -0.211147 |
| timestamp | -0.042481 | -0.020679 | -0.001655 | 0.004974 | -0.007001 | 0.006754 | -0.040553 | -0.018144 | 0.023145 | 0.001438 | 0.012964 | 1.000000 | -0.089644 |
| price_doc | 0.691355 | 0.576936 | 0.146818 | 0.074002 | 0.187443 | -0.118232 | 0.479110 | 0.312810 | 0.074048 | -0.002547 | -0.211147 | -0.089644 | 1.000000 |
Столбцы full_sq и life_sq сильно коррелированы, поэтому столбец life_sq можно удалить
# life_sq is strongly correlated with full_sq (0.86 in the correlation
# matrix), so only full_sq is kept.
train_df = train_df.drop('life_sq', axis=1)
train_df
| full_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | kremlin_km | timestamp | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 10.0 | 1.0 | 16.0 | 44.0 | 2.0 | 8.0 | 3.0 | 55 | 15.345902 | 3963 | 9000000 |
| 8154 | 45 | 6.0 | 1.0 | 9.0 | 54.0 | 2.0 | 6.0 | 3.0 | 94 | 20.728839 | 3961 | 7100000 |
| 8255 | 74 | 12.0 | 1.0 | 24.0 | 20.0 | 3.0 | 9.0 | 3.0 | 39 | 13.529297 | 3958 | 12100000 |
| 8282 | 51 | 7.0 | 1.0 | 17.0 | 21.0 | 2.0 | 9.0 | 3.0 | 103 | 17.455159 | 3956 | 7700000 |
| 8287 | 77 | 3.0 | 2.0 | 5.0 | 67.0 | 3.0 | 8.0 | 2.0 | 0 | 5.200064 | 3954 | 11700000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30463 | 56 | 13.0 | 1.0 | 14.0 | 23.0 | 2.0 | 11.0 | 3.0 | 97 | 16.626186 | 3197 | 12000000 |
| 30466 | 44 | 7.0 | 1.0 | 9.0 | 49.0 | 2.0 | 6.0 | 3.0 | 81 | 12.599292 | 3197 | 7400000 |
| 30467 | 86 | 3.0 | 2.0 | 9.0 | 89.0 | 4.0 | 10.0 | 3.0 | 112 | 3.269284 | 3197 | 25000000 |
| 30469 | 64 | 5.0 | 1.0 | 15.0 | 21.0 | 2.0 | 11.0 | 2.0 | 76 | 13.622569 | 3197 | 13500000 |
| 30470 | 43 | 1.0 | 1.0 | 9.0 | 56.0 | 2.0 | 6.0 | 2.0 | 74 | 11.812614 | 3197 | 5600000 |
12662 rows × 12 columns
sns.heatmap(train_df.corr(), cmap="YlGnBu", annot=True)
<Axes: >
train_df
| full_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | kremlin_km | timestamp | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 10.0 | 1.0 | 16.0 | 44.0 | 2.0 | 8.0 | 3.0 | 55 | 15.345902 | 3963 | 9000000 |
| 8154 | 45 | 6.0 | 1.0 | 9.0 | 54.0 | 2.0 | 6.0 | 3.0 | 94 | 20.728839 | 3961 | 7100000 |
| 8255 | 74 | 12.0 | 1.0 | 24.0 | 20.0 | 3.0 | 9.0 | 3.0 | 39 | 13.529297 | 3958 | 12100000 |
| 8282 | 51 | 7.0 | 1.0 | 17.0 | 21.0 | 2.0 | 9.0 | 3.0 | 103 | 17.455159 | 3956 | 7700000 |
| 8287 | 77 | 3.0 | 2.0 | 5.0 | 67.0 | 3.0 | 8.0 | 2.0 | 0 | 5.200064 | 3954 | 11700000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30463 | 56 | 13.0 | 1.0 | 14.0 | 23.0 | 2.0 | 11.0 | 3.0 | 97 | 16.626186 | 3197 | 12000000 |
| 30466 | 44 | 7.0 | 1.0 | 9.0 | 49.0 | 2.0 | 6.0 | 3.0 | 81 | 12.599292 | 3197 | 7400000 |
| 30467 | 86 | 3.0 | 2.0 | 9.0 | 89.0 | 4.0 | 10.0 | 3.0 | 112 | 3.269284 | 3197 | 25000000 |
| 30469 | 64 | 5.0 | 1.0 | 15.0 | 21.0 | 2.0 | 11.0 | 2.0 | 76 | 13.622569 | 3197 | 13500000 |
| 30470 | 43 | 1.0 | 1.0 | 9.0 | 56.0 | 2.0 | 6.0 | 2.0 | 74 | 11.812614 | 3197 | 5600000 |
12662 rows × 12 columns
Выявление аномалий¶
Для начала удалим неположительные числа там, где их быть не должно
# Keep only the rows in which every listed quantity is strictly positive.
# NaN compares as False, so rows with a missing value in any of these
# columns are removed as well.
train_df = train_df[
    (
        train_df[
            ['full_sq', 'floor', 'max_floor', 'num_room', 'kitch_sq', 'kremlin_km']
        ]
        > 0
    ).all(axis=1)
]
train_df
| full_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | kremlin_km | timestamp | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 10.0 | 1.0 | 16.0 | 44.0 | 2.0 | 8.0 | 3.0 | 55 | 15.345902 | 3963 | 9000000 |
| 8154 | 45 | 6.0 | 1.0 | 9.0 | 54.0 | 2.0 | 6.0 | 3.0 | 94 | 20.728839 | 3961 | 7100000 |
| 8255 | 74 | 12.0 | 1.0 | 24.0 | 20.0 | 3.0 | 9.0 | 3.0 | 39 | 13.529297 | 3958 | 12100000 |
| 8282 | 51 | 7.0 | 1.0 | 17.0 | 21.0 | 2.0 | 9.0 | 3.0 | 103 | 17.455159 | 3956 | 7700000 |
| 8287 | 77 | 3.0 | 2.0 | 5.0 | 67.0 | 3.0 | 8.0 | 2.0 | 0 | 5.200064 | 3954 | 11700000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30463 | 56 | 13.0 | 1.0 | 14.0 | 23.0 | 2.0 | 11.0 | 3.0 | 97 | 16.626186 | 3197 | 12000000 |
| 30466 | 44 | 7.0 | 1.0 | 9.0 | 49.0 | 2.0 | 6.0 | 3.0 | 81 | 12.599292 | 3197 | 7400000 |
| 30467 | 86 | 3.0 | 2.0 | 9.0 | 89.0 | 4.0 | 10.0 | 3.0 | 112 | 3.269284 | 3197 | 25000000 |
| 30469 | 64 | 5.0 | 1.0 | 15.0 | 21.0 | 2.0 | 11.0 | 2.0 | 76 | 13.622569 | 3197 | 13500000 |
| 30470 | 43 | 1.0 | 1.0 | 9.0 | 56.0 | 2.0 | 6.0 | 2.0 | 74 | 11.812614 | 3197 | 5600000 |
12597 rows × 12 columns
Удалим строки, в которых значение столбца state не принадлежит отрезку [1, 5]
# Keep rows whose state lies in the closed interval [1, 5];
# Series.between is inclusive on both ends by default.
train_df = train_df[train_df['state'].between(1, 5)]
train_df
| full_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | kremlin_km | timestamp | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 10.0 | 1.0 | 16.0 | 44.0 | 2.0 | 8.0 | 3.0 | 55 | 15.345902 | 3963 | 9000000 |
| 8154 | 45 | 6.0 | 1.0 | 9.0 | 54.0 | 2.0 | 6.0 | 3.0 | 94 | 20.728839 | 3961 | 7100000 |
| 8255 | 74 | 12.0 | 1.0 | 24.0 | 20.0 | 3.0 | 9.0 | 3.0 | 39 | 13.529297 | 3958 | 12100000 |
| 8282 | 51 | 7.0 | 1.0 | 17.0 | 21.0 | 2.0 | 9.0 | 3.0 | 103 | 17.455159 | 3956 | 7700000 |
| 8287 | 77 | 3.0 | 2.0 | 5.0 | 67.0 | 3.0 | 8.0 | 2.0 | 0 | 5.200064 | 3954 | 11700000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30463 | 56 | 13.0 | 1.0 | 14.0 | 23.0 | 2.0 | 11.0 | 3.0 | 97 | 16.626186 | 3197 | 12000000 |
| 30466 | 44 | 7.0 | 1.0 | 9.0 | 49.0 | 2.0 | 6.0 | 3.0 | 81 | 12.599292 | 3197 | 7400000 |
| 30467 | 86 | 3.0 | 2.0 | 9.0 | 89.0 | 4.0 | 10.0 | 3.0 | 112 | 3.269284 | 3197 | 25000000 |
| 30469 | 64 | 5.0 | 1.0 | 15.0 | 21.0 | 2.0 | 11.0 | 2.0 | 76 | 13.622569 | 3197 | 13500000 |
| 30470 | 43 | 1.0 | 1.0 | 9.0 | 56.0 | 2.0 | 6.0 | 2.0 | 74 | 11.812614 | 3197 | 5600000 |
12597 rows × 12 columns
Построим ящики с усами для каждого столбца
sns.boxplot(train_df['full_sq'])  # a great many outliers
<Axes: ylabel='full_sq'>
sns.boxplot(train_df['floor'])  # no anomalies, but many outliers
<Axes: ylabel='floor'>
sns.boxplot(train_df['material'])
<Axes: ylabel='material'>
plt.hist(train_df['material'])
(array([7928., 0., 2483., 0., 0., 0., 569., 0., 1382.,
235.]),
array([1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. ]),
<BarContainer object of 10 artists>)
train_df.material.value_counts()
material 1.0 7928 2.0 2483 5.0 1382 4.0 569 6.0 235 Name: count, dtype: int64
sns.boxplot(train_df['max_floor'])  # values are realistic, but there are outliers
<Axes: ylabel='max_floor'>
sns.boxplot(train_df['build_year'])  # outliers
<Axes: ylabel='build_year'>
sns.boxplot(train_df['num_room'])  # outliers
<Axes: ylabel='num_room'>
sns.boxplot(train_df['kitch_sq'])  # many outliers
<Axes: ylabel='kitch_sq'>
sns.boxplot(train_df['state'])
<Axes: ylabel='state'>
sns.boxplot(train_df['sub_area'])
<Axes: ylabel='sub_area'>
sns.boxplot(train_df['kremlin_km'])  # many outliers
<Axes: ylabel='kremlin_km'>
sns.boxplot(train_df['timestamp'])
<Axes: ylabel='timestamp'>
sns.boxplot(train_df['price_doc'])  # many outliers
<Axes: ylabel='price_doc'>
Удаляем выбросы методом межквартильного диапазона
# Interquartile-range outlier removal, applied column by column.
# The frame is re-filtered inside the loop, so each column's quartiles are
# computed on the rows that survived the previous columns' filters.
# NOTE(review): the loop also covers the coded columns (material, state,
# sub_area) and the target price_doc -- confirm that this is intended.
for column in train_df.columns:
    values = train_df[column]
    first_quartile = np.quantile(values, 0.25)
    third_quartile = np.quantile(values, 0.75)
    fence = 1.5 * (third_quartile - first_quartile)
    # Keep rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds included.
    train_df = train_df[
        values.between(first_quartile - fence, third_quartile + fence)
    ]
train_df
| full_sq | floor | material | max_floor | build_year | num_room | kitch_sq | state | sub_area | kremlin_km | timestamp | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 10.0 | 1.0 | 16.0 | 44.0 | 2.0 | 8.0 | 3.0 | 55 | 15.345902 | 3963 | 9000000 |
| 8154 | 45 | 6.0 | 1.0 | 9.0 | 54.0 | 2.0 | 6.0 | 3.0 | 94 | 20.728839 | 3961 | 7100000 |
| 8255 | 74 | 12.0 | 1.0 | 24.0 | 20.0 | 3.0 | 9.0 | 3.0 | 39 | 13.529297 | 3958 | 12100000 |
| 8282 | 51 | 7.0 | 1.0 | 17.0 | 21.0 | 2.0 | 9.0 | 3.0 | 103 | 17.455159 | 3956 | 7700000 |
| 8287 | 77 | 3.0 | 2.0 | 5.0 | 67.0 | 3.0 | 8.0 | 2.0 | 0 | 5.200064 | 3954 | 11700000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30461 | 36 | 5.0 | 1.0 | 16.0 | 44.0 | 1.0 | 8.0 | 3.0 | 76 | 10.516156 | 3197 | 5000000 |
| 30463 | 56 | 13.0 | 1.0 | 14.0 | 23.0 | 2.0 | 11.0 | 3.0 | 97 | 16.626186 | 3197 | 12000000 |
| 30466 | 44 | 7.0 | 1.0 | 9.0 | 49.0 | 2.0 | 6.0 | 3.0 | 81 | 12.599292 | 3197 | 7400000 |
| 30469 | 64 | 5.0 | 1.0 | 15.0 | 21.0 | 2.0 | 11.0 | 2.0 | 76 | 13.622569 | 3197 | 13500000 |
| 30470 | 43 | 1.0 | 1.0 | 9.0 | 56.0 | 2.0 | 6.0 | 2.0 | 74 | 11.812614 | 3197 | 5600000 |
9080 rows × 12 columns
Проверяем сбалансированность¶
sns.violinplot(train_df.full_sq)
<Axes: ylabel='full_sq'>
sns.violinplot(train_df.floor)
<Axes: ylabel='floor'>
sns.violinplot(train_df.material)
<Axes: ylabel='material'>
sns.violinplot(train_df.max_floor)
<Axes: ylabel='max_floor'>
sns.violinplot(train_df.build_year)
<Axes: ylabel='build_year'>
sns.violinplot(train_df.num_room)
<Axes: ylabel='num_room'>
sns.violinplot(train_df.kitch_sq)
<Axes: ylabel='kitch_sq'>
sns.violinplot(train_df.state)
<Axes: ylabel='state'>
sns.violinplot(train_df.sub_area)
<Axes: ylabel='sub_area'>
sns.violinplot(train_df.kremlin_km)
<Axes: ylabel='kremlin_km'>
sns.violinplot(train_df.timestamp)
<Axes: ylabel='timestamp'>
plt.hist(train_df['price_doc'])
(array([ 452., 623., 356., 1760., 2279., 1448., 809., 614., 450.,
289.]),
array([ 500000., 1910000., 3320000., 4730000., 6140000., 7550000.,
8960000., 10370000., 11780000., 13190000., 14600000.]),
<BarContainer object of 10 artists>)
sns.violinplot(train_df['price_doc'])
<Axes: ylabel='price_doc'>
Отбор признаков¶
sns.pairplot(train_df, hue='price_doc')
<seaborn.axisgrid.PairGrid at 0x285d4c43310>
Столбцы material и timestamp практически не влияют на целевую переменную
# Drop the two features judged above to have almost no effect on price_doc.
train_df = train_df.drop(['material', 'timestamp'], axis=1)
train_df
| full_sq | floor | max_floor | build_year | num_room | kitch_sq | state | sub_area | kremlin_km | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 10.0 | 16.0 | 44.0 | 2.0 | 8.0 | 3.0 | 55 | 15.345902 | 9000000 |
| 8154 | 45 | 6.0 | 9.0 | 54.0 | 2.0 | 6.0 | 3.0 | 94 | 20.728839 | 7100000 |
| 8255 | 74 | 12.0 | 24.0 | 20.0 | 3.0 | 9.0 | 3.0 | 39 | 13.529297 | 12100000 |
| 8282 | 51 | 7.0 | 17.0 | 21.0 | 2.0 | 9.0 | 3.0 | 103 | 17.455159 | 7700000 |
| 8287 | 77 | 3.0 | 5.0 | 67.0 | 3.0 | 8.0 | 2.0 | 0 | 5.200064 | 11700000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30461 | 36 | 5.0 | 16.0 | 44.0 | 1.0 | 8.0 | 3.0 | 76 | 10.516156 | 5000000 |
| 30463 | 56 | 13.0 | 14.0 | 23.0 | 2.0 | 11.0 | 3.0 | 97 | 16.626186 | 12000000 |
| 30466 | 44 | 7.0 | 9.0 | 49.0 | 2.0 | 6.0 | 3.0 | 81 | 12.599292 | 7400000 |
| 30469 | 64 | 5.0 | 15.0 | 21.0 | 2.0 | 11.0 | 2.0 | 76 | 13.622569 | 13500000 |
| 30470 | 43 | 1.0 | 9.0 | 56.0 | 2.0 | 6.0 | 2.0 | 74 | 11.812614 | 5600000 |
9080 rows × 10 columns
train_df
| full_sq | floor | max_floor | build_year | num_room | kitch_sq | state | sub_area | kremlin_km | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 10.0 | 16.0 | 44.0 | 2.0 | 8.0 | 3.0 | 55 | 15.345902 | 9000000 |
| 8154 | 45 | 6.0 | 9.0 | 54.0 | 2.0 | 6.0 | 3.0 | 94 | 20.728839 | 7100000 |
| 8255 | 74 | 12.0 | 24.0 | 20.0 | 3.0 | 9.0 | 3.0 | 39 | 13.529297 | 12100000 |
| 8282 | 51 | 7.0 | 17.0 | 21.0 | 2.0 | 9.0 | 3.0 | 103 | 17.455159 | 7700000 |
| 8287 | 77 | 3.0 | 5.0 | 67.0 | 3.0 | 8.0 | 2.0 | 0 | 5.200064 | 11700000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30461 | 36 | 5.0 | 16.0 | 44.0 | 1.0 | 8.0 | 3.0 | 76 | 10.516156 | 5000000 |
| 30463 | 56 | 13.0 | 14.0 | 23.0 | 2.0 | 11.0 | 3.0 | 97 | 16.626186 | 12000000 |
| 30466 | 44 | 7.0 | 9.0 | 49.0 | 2.0 | 6.0 | 3.0 | 81 | 12.599292 | 7400000 |
| 30469 | 64 | 5.0 | 15.0 | 21.0 | 2.0 | 11.0 | 2.0 | 76 | 13.622569 | 13500000 |
| 30470 | 43 | 1.0 | 9.0 | 56.0 | 2.0 | 6.0 | 2.0 | 74 | 11.812614 | 5600000 |
9080 rows × 10 columns
Статистики итогового датасета¶
train_df.describe()
| full_sq | floor | max_floor | build_year | num_room | kitch_sq | state | sub_area | kremlin_km | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 9080.000000 | 9080.000000 | 9080.000000 | 9080.000000 | 9080.000000 | 9080.000000 | 9080.000000 | 9080.000000 | 9080.000000 | 9.080000e+03 |
| mean | 48.982269 | 6.343502 | 11.439758 | 44.225110 | 1.902863 | 7.014427 | 2.429295 | 60.411233 | 12.669635 | 7.125241e+06 |
| std | 14.368884 | 4.210085 | 4.945000 | 17.136005 | 0.783288 | 2.294755 | 0.632891 | 34.395134 | 4.472484 | 2.991534e+06 |
| min | 12.000000 | 1.000000 | 1.000000 | 9.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.072897 | 5.000000e+05 |
| 25% | 38.000000 | 3.000000 | 9.000000 | 30.000000 | 1.000000 | 5.000000 | 2.000000 | 32.000000 | 9.602983 | 5.550000e+06 |
| 50% | 45.000000 | 5.000000 | 12.000000 | 48.000000 | 2.000000 | 7.000000 | 2.000000 | 60.000000 | 12.970416 | 6.900000e+06 |
| 75% | 58.000000 | 9.000000 | 16.000000 | 57.000000 | 2.000000 | 8.000000 | 3.000000 | 86.000000 | 15.483092 | 8.800000e+06 |
| max | 95.000000 | 18.000000 | 25.000000 | 97.000000 | 6.000000 | 15.000000 | 4.000000 | 123.000000 | 25.254369 | 1.460000e+07 |
Вернем строковые значения районам для удобства¶
# Map the integer sub_area codes back to district names.
# NOTE(review): `le` is presumably the LabelEncoder fitted on sub_area earlier
# in the notebook -- confirm.
label = le.inverse_transform(train_df['sub_area'])
train_df = train_df.assign(sub_area=label)
train_df
| full_sq | floor | max_floor | build_year | num_room | kitch_sq | state | sub_area | kremlin_km | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|
| 8135 | 53 | 10.0 | 16.0 | 44.0 | 2.0 | 8.0 | 3.0 | Lianozovo | 15.345902 | 9000000 |
| 8154 | 45 | 6.0 | 9.0 | 54.0 | 2.0 | 6.0 | 3.0 | Severnoe Butovo | 20.728839 | 7100000 |
| 8255 | 74 | 12.0 | 24.0 | 20.0 | 3.0 | 9.0 | 3.0 | Juzhnoe Medvedkovo | 13.529297 | 12100000 |
| 8282 | 51 | 7.0 | 17.0 | 21.0 | 2.0 | 9.0 | 3.0 | Solncevo | 17.455159 | 7700000 |
| 8287 | 77 | 3.0 | 5.0 | 67.0 | 3.0 | 8.0 | 2.0 | Ajeroport | 5.200064 | 11700000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30461 | 36 | 5.0 | 16.0 | 44.0 | 1.0 | 8.0 | 3.0 | Obruchevskoe | 10.516156 | 5000000 |
| 30463 | 56 | 13.0 | 14.0 | 23.0 | 2.0 | 11.0 | 3.0 | Severnoe Tushino | 16.626186 | 12000000 |
| 30466 | 44 | 7.0 | 9.0 | 49.0 | 2.0 | 6.0 | 3.0 | Otradnoe | 12.599292 | 7400000 |
| 30469 | 64 | 5.0 | 15.0 | 21.0 | 2.0 | 11.0 | 2.0 | Obruchevskoe | 13.622569 | 13500000 |
| 30470 | 43 | 1.0 | 9.0 | 56.0 | 2.0 | 6.0 | 2.0 | Novogireevo | 11.812614 | 5600000 |
9080 rows × 10 columns
Сравним полученные данные с реальными современными значениями¶
train_df.sample()
| full_sq | floor | max_floor | build_year | num_room | kitch_sq | state | sub_area | kremlin_km | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|
| 24348 | 54 | 1.0 | 5.0 | 66.0 | 2.0 | 7.0 | 3.0 | Perovo | 9.549202 | 6750000 |
Получили значение:
Площадь - 54
Этаж - 1
Максимум этажей - 5
Дом построен 66 лет назад
Две комнаты
Площадь кухни - 7
Состояние - 3
Район - Перово
Цена - 6750000
Найден похожий вариант в сервисе подбора недвижимости Циан: https://www.cian.ru/sale/flat/297573553/ Цена 16200000
train_df.sample()
| full_sq | floor | max_floor | build_year | num_room | kitch_sq | state | sub_area | kremlin_km | price_doc | |
|---|---|---|---|---|---|---|---|---|---|---|
| 8825 | 59 | 5.0 | 9.0 | 55.0 | 3.0 | 6.0 | 3.0 | Kon'kovo | 12.554028 | 10000000 |
Получили значение:
Площадь - 59
Этаж - 5
Максимум этажей - 9
Дом построен 55 лет назад
Три комнаты
Площадь кухни - 6
Состояние - 3
Район - Коньково
Цена - 10000000
Найден похожий вариант в сервисе подбора недвижимости Циан: https://www.cian.ru/sale/flat/299026764/ Цена 16500000
Вывод¶
В ходе анализа данных мы получили набор, в котором нет пропусков, аномалий, выбросов и лишних столбцов. Но данные устарели и не могут быть использованы для построения регрессионной модели для предсказания современных цен на недвижимость